also try to get character encoding from XML declaration and http-equiv meta tag

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@178 86f53f14-5ff3-0310-afe5-9b438ce3f40c
author: Arthur de Jong <arthur@arthurdejong.org> 2005-09-17 17:58:42 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2005-09-17 17:58:42 +0200
commit: a6512ccafe4ac02bf06c476e8f3a2dd2971c55e0 (patch)
tree: 946d9c3f0d818f4201717efaf50e366c00d16257 /parsers/html.py
parent: 8d591e6eb49354cd01a3adeb70bee7ab31feb6c7 (diff)
1 files changed, 22 insertions, 0 deletions
diff --git a/parsers/html.py b/parsers/html.py
index 5e4083f..27a3ea5 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -34,6 +34,12 @@ _charentitypattern = re.compile('&#[0-9]{1,3};')
 # pattern for matching spaces
 _spacepattern = re.compile(" ")
 
+# pattern for matching charset declaration for http-equiv tag (group 1 is the charset name; optional quotes and a trailing ';' are not captured)
+_charsetpattern = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.I)
+
+# pattern for matching the encoding part of an xml declaration (group 1 is the encoding name; single or double quotes are accepted)
+_encodingpattern = re.compile(r'^xml\s.*encoding\s*=\s*["\']([^"\']*)["\']', re.I)
+
 class _MyHTMLParser(HTMLParser.HTMLParser):
     """A simple subclass of HTMLParser.HTMLParser continuing after errors
     and gathering some information from the parsed content."""
@@ -116,6 +122,13 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         # <meta http-equiv="refresh" content="0;url=http://ch.tudelft.nl/~arthur/">
         elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "refresh":
             pass # TODO: implement
+        # <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+        elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "content-type":
+            if self.link.encoding is None:
+                try:
+                    self.link.encoding = _charsetpattern.search(attrs["content"]).group(1)
+                except AttributeError:
+                    pass
         # <img src="url">
         elif tag == "img" and attrs.has_key("src"):
             self.embedded.append(self._cleanurl(attrs["src"]))
@@ -167,6 +180,15 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         handle_data()."""
         self.handle_data('&'+name+';')
 
+    def handle_pi(self, data):
+        """Handle a processing instruction such as the xml declaration."""
+        # an encoding that was already determined takes precedence
+        if self.link.encoding is not None:
+            return
+        found = _encodingpattern.search(data)
+        if found is not None:
+            self.link.encoding = found.group(1)
+
 def parse(content, link):
     """Parse the specified content and extract an url list, a list of images a
     title and an author. The content is assumed to contain HMTL."""
author	Arthur de Jong <arthur@arthurdejong.org>	2005-09-17 17:58:42 +0200
committer	Arthur de Jong <arthur@arthurdejong.org>	2005-09-17 17:58:42 +0200
commit	a6512ccafe4ac02bf06c476e8f3a2dd2971c55e0 (patch)
tree	946d9c3f0d818f4201717efaf50e366c00d16257 /parsers/html.py
parent	8d591e6eb49354cd01a3adeb70bee7ab31feb6c7 (diff)