Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
path: root/parsers
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org>, 2005-07-31 11:45:13 +0200
committer: Arthur de Jong <arthur@arthurdejong.org>, 2005-07-31 11:45:13 +0200
commit: 1f349f9c8710a3afe6467946b2380b66c0386ca6 (patch)
tree: fcac5dd79c5c3b808843de1a309259bb49532ead /parsers
parent: 607f05e12d36af0f50bfb8a2a53d9edb7d57de8a (diff)
replace numeric entity refs with their proper values based on patch by Eric W. Brown <eric@saugus.net>
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@117 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers')
-rw-r--r-- parsers/html.py 13
1 files changed, 11 insertions, 2 deletions
diff --git a/parsers/html.py b/parsers/html.py
index ad755e8..16c636f 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -23,6 +23,7 @@ import config
import debugio
import HTMLParser
import urlparse
+import re
# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -123,6 +124,14 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
if self.collect is not None:
self.collect += data
+def _cleanurl(url):
+ """Do some translations of url."""
+ # replace &#nnn; entity refs with proper characters
+ charEntityPattern = re.compile('&#[0-9]{1,3};')
+ for charEntity in charEntityPattern.findall(url):
+ url = url.replace(charEntity,chr(int(charEntity[2:-1])))
+ return url
+
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images a
title and an author. The content is assumed to contain HMTL."""
@@ -152,6 +161,6 @@ def parse(content, link):
base = parser.base
# list embedded and children
for embed in parser.embedded:
- link.add_embed(urlparse.urljoin(base,embed))
+ link.add_embed(urlparse.urljoin(base,_cleanurl(embed)))
for child in parser.children:
- link.add_child(urlparse.urljoin(base,child))
+ link.add_child(urlparse.urljoin(base,_cleanurl(child)))