diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2005-09-17 17:58:42 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2005-09-17 17:58:42 +0200 |
commit | a6512ccafe4ac02bf06c476e8f3a2dd2971c55e0 (patch) | |
tree | 946d9c3f0d818f4201717efaf50e366c00d16257 /parsers/html.py | |
parent | 8d591e6eb49354cd01a3adeb70bee7ab31feb6c7 (diff) |
also try to get character encoding from XML declaration and http-equiv meta tag
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@178 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers/html.py')
-rw-r--r-- | parsers/html.py | 22 |
1 files changed, 22 insertions, 0 deletions
```diff
diff --git a/parsers/html.py b/parsers/html.py
index 5e4083f..27a3ea5 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -34,6 +34,12 @@ _charentitypattern = re.compile('&#[0-9]{1,3};')
 # pattern for matching spaces
 _spacepattern = re.compile(" ")
 
+# pattern for matching charset declaration for http-equiv tag
+_charsetpattern = re.compile('charset=([^ ]*)', re.I)
+
+# pattern for matching the encoding part of an xml declaration
+_encodingpattern = re.compile('^xml .*encoding="([^"]*)"', re.I)
+
 class _MyHTMLParser(HTMLParser.HTMLParser):
     """A simple subclass of HTMLParser.HTMLParser continuing after errors
     and gathering some information from the parsed content."""
@@ -116,6 +122,13 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         # <meta http-equiv="refresh" content="0;url=http://ch.tudelft.nl/~arthur/">
         elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "refresh":
             pass # TODO: implement
+        # <meta http-equiv="content-type" content="text/html; charset=utf-8" />
+        elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "content-type":
+            if self.link.encoding is None:
+                try:
+                    self.link.encoding = _charsetpattern.search(attrs["content"]).group(1)
+                except AttributeError:
+                    pass
         # <img src="url">
         elif tag == "img" and attrs.has_key("src"):
             self.embedded.append(self._cleanurl(attrs["src"]))
@@ -167,6 +180,15 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         handle_data()."""
         self.handle_data('&'+name+';')
 
+    def handle_pi(self, data):
+        """Hanlde xml declaration."""
+        # find character encoding from declaration
+        if self.link.encoding is None:
+            try:
+                self.link.encoding = _encodingpattern.search(data).group(1)
+            except AttributeError:
+                pass
+
 def parse(content, link):
     """Parse the specified content and extract an url list, a list of images a
     title and an author. The content is assumed to contain HMTL."""
```