diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2005-07-31 11:45:13 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2005-07-31 11:45:13 +0200 |
commit | 1f349f9c8710a3afe6467946b2380b66c0386ca6 (patch) | |
tree | fcac5dd79c5c3b808843de1a309259bb49532ead /parsers | |
parent | 607f05e12d36af0f50bfb8a2a53d9edb7d57de8a (diff) |
replace numeric entity refs with their proper values based on patch by Eric W.Brown <eric@saugus.net>
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@117 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/html.py | 13 |
1 files changed, 11 insertions, 2 deletions
```diff
diff --git a/parsers/html.py b/parsers/html.py
index ad755e8..16c636f 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -23,6 +23,7 @@ import config
 import debugio
 import HTMLParser
 import urlparse
+import re
 
 # the list of mimetypes this module should be able to handle
 mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -123,6 +124,14 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         if self.collect is not None:
             self.collect += data
 
+def _cleanurl(url):
+    """Do some translations of url."""
+    # replace &#nnn; entity refs with proper characters
+    charEntityPattern = re.compile('&#[0-9]{1,3};')
+    for charEntity in charEntityPattern.findall(url):
+        url = url.replace(charEntity,chr(int(charEntity[2:-1])))
+    return url
+
 def parse(content, link):
     """Parse the specified content and extract an url list, a list of images a
     title and an author. The content is assumed to contain HMTL."""
@@ -152,6 +161,6 @@ def parse(content, link):
         base = parser.base
     # list embedded and children
     for embed in parser.embedded:
-        link.add_embed(urlparse.urljoin(base,embed))
+        link.add_embed(urlparse.urljoin(base,_cleanurl(embed)))
     for child in parser.children:
-        link.add_child(urlparse.urljoin(base,child))
+        link.add_child(urlparse.urljoin(base,_cleanurl(child)))
```