From 1f349f9c8710a3afe6467946b2380b66c0386ca6 Mon Sep 17 00:00:00 2001
From: Arthur de Jong <arthur@arthurdejong.org>
Date: Sun, 31 Jul 2005 09:45:13 +0000
Subject: replace numeric entity refs with their proper values based on patch
 by Eric W.Brown <eric@saugus.net>

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@117 86f53f14-5ff3-0310-afe5-9b438ce3f40c
---
 parsers/html.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'parsers')

diff --git a/parsers/html.py b/parsers/html.py
index ad755e8..16c636f 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -23,6 +23,7 @@ import config
 import debugio
 import HTMLParser
 import urlparse
+import re
 
 # the list of mimetypes this module should be able to handle
 mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -123,6 +124,14 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         if self.collect is not None:
             self.collect += data
 
+def _cleanurl(url):
+    """Return url with every decimal &#nnn; reference (1-3 digits) replaced by chr(nnn)."""
+    # NOTE(review): Python 2 chr() only accepts 0-255, so &#256;..&#999; would raise ValueError - confirm
+    charEntityPattern = re.compile('&#[0-9]{1,3};')
+    for charEntity in charEntityPattern.findall(url):
+        url = url.replace(charEntity,chr(int(charEntity[2:-1])))
+    return url
+
 def parse(content, link):
     """Parse the specified content and extract an url list, a list of images a
     title and an author. The content is assumed to contain HMTL."""
@@ -152,6 +161,6 @@ def parse(content, link):
         base = parser.base
     # list embedded and children
     for embed in parser.embedded:
-        link.add_embed(urlparse.urljoin(base,embed))
+        link.add_embed(urlparse.urljoin(base,_cleanurl(embed)))
     for child in parser.children:
-        link.add_child(urlparse.urljoin(base,child))
+        link.add_child(urlparse.urljoin(base,_cleanurl(child)))
-- 
cgit v1.2.3