From 1f349f9c8710a3afe6467946b2380b66c0386ca6 Mon Sep 17 00:00:00 2001 From: Arthur de Jong Date: Sun, 31 Jul 2005 09:45:13 +0000 Subject: replace numeric entity refs with their proper values based on patch by Eric W.Brown git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@117 86f53f14-5ff3-0310-afe5-9b438ce3f40c --- parsers/html.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'parsers') diff --git a/parsers/html.py b/parsers/html.py index ad755e8..16c636f 100644 --- a/parsers/html.py +++ b/parsers/html.py @@ -23,6 +23,7 @@ import config import debugio import HTMLParser import urlparse +import re # the list of mimetypes this module should be able to handle mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html') @@ -123,6 +124,14 @@ class _MyHTMLParser(HTMLParser.HTMLParser): if self.collect is not None: self.collect += data +def _cleanurl(url): + """Do some translations of url.""" + # replace &#nnn; entity refs with proper characters + charEntityPattern = re.compile('&#[0-9]{1,3};') + for charEntity in charEntityPattern.findall(url): + url = url.replace(charEntity,chr(int(charEntity[2:-1]))) + return url + def parse(content, link): """Parse the specified content and extract an url list, a list of images a title and an author. The content is assumed to contain HMTL.""" @@ -152,6 +161,6 @@ def parse(content, link): base = parser.base # list embedded and children for embed in parser.embedded: - link.add_embed(urlparse.urljoin(base,embed)) + link.add_embed(urlparse.urljoin(base,_cleanurl(embed))) for child in parser.children: - link.add_child(urlparse.urljoin(base,child)) + link.add_child(urlparse.urljoin(base,_cleanurl(child))) -- cgit v1.2.3