Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/parsers/html.py
diff options
context:
space:
mode:
authorArthur de Jong <arthur@arthurdejong.org>2005-09-17 17:58:42 +0200
committerArthur de Jong <arthur@arthurdejong.org>2005-09-17 17:58:42 +0200
commita6512ccafe4ac02bf06c476e8f3a2dd2971c55e0 (patch)
tree946d9c3f0d818f4201717efaf50e366c00d16257 /parsers/html.py
parent8d591e6eb49354cd01a3adeb70bee7ab31feb6c7 (diff)
also try to get character encoding from XML declaration and http-equiv meta tag
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@178 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers/html.py')
-rw-r--r--parsers/html.py22
1 files changed, 22 insertions, 0 deletions
diff --git a/parsers/html.py b/parsers/html.py
index 5e4083f..27a3ea5 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -34,6 +34,12 @@ _charentitypattern = re.compile('&#[0-9]{1,3};')
# pattern for matching spaces
_spacepattern = re.compile(" ")
+# pattern for matching charset declaration for http-equiv tag
+_charsetpattern = re.compile('charset=([^ ]*)', re.I)
+
+# pattern for matching the encoding part of an xml declaration
+_encodingpattern = re.compile('^xml .*encoding="([^"]*)"', re.I)
+
class _MyHTMLParser(HTMLParser.HTMLParser):
"""A simple subclass of HTMLParser.HTMLParser continuing after errors
and gathering some information from the parsed content."""
@@ -116,6 +122,13 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
# <meta http-equiv="refresh" content="0;url=http://ch.tudelft.nl/~arthur/">
elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "refresh":
pass # TODO: implement
+ # <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+ elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "content-type":
+ if self.link.encoding is None:
+ try:
+ self.link.encoding = _charsetpattern.search(attrs["content"]).group(1)
+ except AttributeError:
+ pass
# <img src="url">
elif tag == "img" and attrs.has_key("src"):
self.embedded.append(self._cleanurl(attrs["src"]))
@@ -167,6 +180,15 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
handle_data()."""
self.handle_data('&'+name+';')
+ def handle_pi(self, data):
+ """Handle xml declaration."""
+ # find character encoding from declaration
+ if self.link.encoding is None:
+ try:
+ self.link.encoding = _encodingpattern.search(data).group(1)
+ except AttributeError:
+ pass
+
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images, a
title and an author. The content is assumed to contain HTML."""