move html escaping and unescaping functions to parsers.html

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@255 86f53f14-5ff3-0310-afe5-9b438ce3f40c
author: Arthur de Jong <arthur@arthurdejong.org> 2006-05-07 11:36:47 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2006-05-07 11:36:47 +0200
commit: 6bc31318910cc06e66cf464882c69a3d9cbd1ac8 (patch)
tree: 8c73af50e0f19d4e14640babfe3eee777c557516 /parsers
parent: fbf614231415cd72eab37c440fec44e577299ed5 (diff)
1 files changed, 52 insertions, 11 deletions
diff --git a/parsers/html.py b/parsers/html.py
index c924824..f82a939 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -27,6 +27,7 @@ import HTMLParser
 import urlparse
 import re
 import crawler
+import htmlentitydefs
 
 # the list of mimetypes this module should be able to handle
 mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
@@ -35,7 +36,7 @@ mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
 _charentitypattern = re.compile('&#([0-9]{1,3});')
 
 # pattern for matching all html entities
-_entitypattern = re.compile('&[^ ;]+;')
+_entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
 
 # pattern for matching spaces
 _spacepattern = re.compile(' ')
@@ -46,6 +47,54 @@ _charsetpattern = re.compile('charset=([^ ]*)', re.I)
 # pattern for matching the encoding part of an xml declaration
 _encodingpattern = re.compile('^xml .*encoding="([^"]*)"', re.I)
 
+def htmlescape(txt, inattr=False):
+    """HTML escape the given string and return an ASCII clean string with
+    known entities and character entities for the other values.
+    If the inattr parameter is set quotes and newlines will also be escaped."""
+    # convert to unicode object
+    if type(txt) is str:
+        txt = unicode(txt, errors='replace')
+    # the output string
+    out = ''
+    # loop over the characters of the string
+    for c in txt:
+        if c == '"':
+            if inattr:
+                out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
+            else:
+                out += '"'
+        elif htmlentitydefs.codepoint2name.has_key(ord(c)):
+            out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
+        elif ord(c) > 126:
+            out += '&#%d;'% ord(c)
+        elif inattr and c == u'\n':
+            out += '&#10;'
+        else:
+            out += c.encode('utf-8')
+    return out
+
+def _unescape_entity(match):
+    """Helper function for _htmlunescape().
+    This funcion unescapes a html entity, it is passed to the sub()
+    function."""
+    if htmlentitydefs.name2codepoint.has_key(match.group(1)):
+        return unichr(htmlentitydefs.name2codepoint[match.group(1)])
+    elif match.group(1)[0] == '#':
+        return unichr(int(match.group(1)[1:]))
+    else:
+        raise IOError('parse error')
+
+def htmlunescape(txt):
+    """This function unescapes a html encoded string.
+    This function returns a unicode string."""
+    # convert to unicode
+    if type(txt) is str:
+        txt = unicode(txt, errors='replace')
+    # replace &name; and &#nn; refs with proper characters
+    txt = _entitypattern.sub(_unescape_entity, txt)
+    # we're done
+    return txt
+
 class _MyHTMLParser(HTMLParser.HTMLParser):
     """A simple subclass of HTMLParser.HTMLParser continuing after errors
     and gathering some information from the parsed content."""
@@ -237,7 +286,6 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
 def _maketxt(txt, encoding):
     """Return an unicode text of the specified string do correct character
     conversions and replacing html entities with normal characters."""
-    import htmlentitydefs
     # convert string to unicode
     # TODO: check for encoding errors (first try unicode() function with strict)
     try:
@@ -248,15 +296,8 @@ def _maketxt(txt, encoding):
             # TODO: log unknown encoding problem as page problem
         # fall back to locale's encoding
         txt = unicode(txt, errors='replace')
-    # replace &#nnn; entity refs with proper characters
-    txt = _charentitypattern.sub(lambda x:unichr(int(x.group(1))), txt)
-    # replace html entity refs with proper characters
-    for entity in _entitypattern.findall(txt):
-        if (htmlentitydefs.name2codepoint.has_key(entity[1:-1])):
-            txt = txt.replace(
-                    entity,
-                    unichr(htmlentitydefs.name2codepoint[entity[1:-1]]) )
-    return txt
+    # replace entity refs with proper characters
+    return htmlunescape(txt)
 
 def parse(content, link):
     """Parse the specified content and extract an url list, a list of images a
author	Arthur de Jong <arthur@arthurdejong.org>	2006-05-07 11:36:47 +0200
committer	Arthur de Jong <arthur@arthurdejong.org>	2006-05-07 11:36:47 +0200
commit	6bc31318910cc06e66cf464882c69a3d9cbd1ac8 (patch)
tree	8c73af50e0f19d4e14640babfe3eee777c557516 /parsers
parent	fbf614231415cd72eab37c440fec44e577299ed5 (diff)