replace numeric entity refs with their proper values, based on a patch by Eric W. Brown <eric@saugus.net>

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@117 86f53f14-5ff3-0310-afe5-9b438ce3f40c
author: Arthur de Jong <arthur@arthurdejong.org> 2005-07-31 11:45:13 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2005-07-31 11:45:13 +0200
commit: 1f349f9c8710a3afe6467946b2380b66c0386ca6 (patch)
tree: fcac5dd79c5c3b808843de1a309259bb49532ead /parsers
parent: 607f05e12d36af0f50bfb8a2a53d9edb7d57de8a (diff)
1 files changed, 11 insertions, 2 deletions
diff --git a/parsers/html.py b/parsers/html.py
index ad755e8..16c636f 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -23,6 +23,7 @@ import config
 import debugio
 import HTMLParser
 import urlparse
+import re
 
 # the list of mimetypes this module should be able to handle
 mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -123,6 +124,14 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         if self.collect is not None:
             self.collect += data
 
+def _cleanurl(url):
+    """Do some translations of url."""
+    # replace &#nnn; entity refs with proper characters
+    charEntityPattern = re.compile('&#[0-9]{1,3};')
+    for charEntity in charEntityPattern.findall(url):
+        url = url.replace(charEntity,chr(int(charEntity[2:-1])))
+    return url
+
 def parse(content, link):
     """Parse the specified content and extract an url list, a list of images a
     title and an author. The content is assumed to contain HMTL."""
@@ -152,6 +161,6 @@ def parse(content, link):
         base = parser.base
     # list embedded and children
     for embed in parser.embedded:
-        link.add_embed(urlparse.urljoin(base,embed))
+        link.add_embed(urlparse.urljoin(base,_cleanurl(embed)))
     for child in parser.children:
-        link.add_child(urlparse.urljoin(base,child))
+        link.add_child(urlparse.urljoin(base,_cleanurl(child)))
author	Arthur de Jong <arthur@arthurdejong.org>	2005-07-31 11:45:13 +0200
committer	Arthur de Jong <arthur@arthurdejong.org>	2005-07-31 11:45:13 +0200
commit	1f349f9c8710a3afe6467946b2380b66c0386ca6 (patch)
tree	fcac5dd79c5c3b808843de1a309259bb49532ead /parsers
parent	607f05e12d36af0f50bfb8a2a53d9edb7d57de8a (diff)