Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
path: root/parsers
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org> 2006-05-07 11:36:47 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2006-05-07 11:36:47 +0200
commit: 6bc31318910cc06e66cf464882c69a3d9cbd1ac8 (patch)
tree: 8c73af50e0f19d4e14640babfe3eee777c557516 /parsers
parent: fbf614231415cd72eab37c440fec44e577299ed5 (diff)
move html escaping and unescaping functions to parsers.html
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@255 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers')
-rw-r--r-- parsers/html.py 63
1 files changed, 52 insertions, 11 deletions
diff --git a/parsers/html.py b/parsers/html.py
index c924824..f82a939 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -27,6 +27,7 @@ import HTMLParser
import urlparse
import re
import crawler
+import htmlentitydefs
# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -35,7 +36,7 @@ mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
_charentitypattern = re.compile('&#([0-9]{1,3});')
# pattern for matching all html entities
-_entitypattern = re.compile('&[^ ;]+;')
+_entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
# pattern for matching spaces
_spacepattern = re.compile(' ')
@@ -46,6 +47,54 @@ _charsetpattern = re.compile('charset=([^ ]*)', re.I)
# pattern for matching the encoding part of an xml declaration
_encodingpattern = re.compile('^xml .*encoding="([^"]*)"', re.I)
+def htmlescape(txt, inattr=False):
+ """HTML escape the given string and return an ASCII clean string with
+ known entities and character entities for the other values.
+ If the inattr parameter is set, double quotes and newlines are also escaped."""
+ # convert to unicode object
+ if type(txt) is str:
+ txt = unicode(txt, errors='replace')
+ # the output string
+ out = ''
+ # loop over the characters of the string
+ for c in txt:
+ if c == '"':
+ if inattr:
+ out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
+ else:
+ out += '"'
+ elif htmlentitydefs.codepoint2name.has_key(ord(c)):
+ out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
+ elif ord(c) > 126:
+ out += '&#%d;'% ord(c)
+ elif inattr and c == u'\n':
+ out += '&#10;'
+ else:
+ out += c.encode('utf-8')
+ return out
+
+def _unescape_entity(match):
+ """Helper function for htmlunescape().
+ This function unescapes an HTML entity; it is passed to the sub()
+ function."""
+ if htmlentitydefs.name2codepoint.has_key(match.group(1)):
+ return unichr(htmlentitydefs.name2codepoint[match.group(1)])
+ elif match.group(1)[0] == '#':
+ return unichr(int(match.group(1)[1:]))
+ else:
+ raise IOError('parse error')
+
+def htmlunescape(txt):
+ """This function unescapes a html encoded string.
+ This function returns a unicode string."""
+ # convert to unicode
+ if type(txt) is str:
+ txt = unicode(txt, errors='replace')
+ # replace &name; and &#nn; refs with proper characters
+ txt = _entitypattern.sub(_unescape_entity, txt)
+ # we're done
+ return txt
+
class _MyHTMLParser(HTMLParser.HTMLParser):
"""A simple subclass of HTMLParser.HTMLParser continuing after errors
and gathering some information from the parsed content."""
@@ -237,7 +286,6 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
def _maketxt(txt, encoding):
"""Return a unicode text of the specified string, doing correct character
conversions and replacing html entities with normal characters."""
- import htmlentitydefs
# convert string to unicode
# TODO: check for encoding errors (first try unicode() function with strict)
try:
@@ -248,15 +296,8 @@ def _maketxt(txt, encoding):
# TODO: log unknown encoding problem as page problem
# fall back to locale's encoding
txt = unicode(txt, errors='replace')
- # replace &#nnn; entity refs with proper characters
- txt = _charentitypattern.sub(lambda x:unichr(int(x.group(1))), txt)
- # replace html entity refs with proper characters
- for entity in _entitypattern.findall(txt):
- if (htmlentitydefs.name2codepoint.has_key(entity[1:-1])):
- txt = txt.replace(
- entity,
- unichr(htmlentitydefs.name2codepoint[entity[1:-1]]) )
- return txt
+ # replace entity refs with proper characters
+ return htmlunescape(txt)
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images a