# html.py - parser functions for html content
#
# Copyright (C) 2005, 2006 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
"""Parser functions for processing HTML content."""
import config
import debugio
import HTMLParser
import urlparse
import urllib
import re
# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
# pattern for matching numeric html entities
_charentitypattern = re.compile('[0-9]{1,3};')
# pattern for matching all html entities
_entitypattern = re.compile('&[^ ;]+;')
# pattern for matching spaces
_spacepattern = re.compile(" ")
# pattern for matching charset declaration for http-equiv tag
_charsetpattern = re.compile('charset=([^ ]*)', re.I)
# pattern for matching the encoding part of an xml declaration
_encodingpattern = re.compile('^xml .*encoding="([^"]*)"', re.I)
class _MyHTMLParser(HTMLParser.HTMLParser):
"""A simple subclass of HTMLParser.HTMLParser continuing after errors
and gathering some information from the parsed content."""
def __init__(self, link):
"""Inialize the menbers in which we collect data from parsing the
document."""
self.link = link
self.collect = None
self.base = None
self.title = None
self.author = None
self.embedded = []
self.children = []
self.errmsg = None
self.errcount = 0
HTMLParser.HTMLParser.__init__(self)
def _location(self):
"""Return the current parser location as a string."""
(lineno, offset) = self.getpos()
if lineno is not None:
msg = 'at line %d' % lineno
else:
msg = 'at unknown line'
if offset is not None:
msg += ', column %d' % (offset + 1)
return msg
def _cleanurl(self, url):
"""Do some translations of url."""
# check for spaces in urls
if _spacepattern.search(url):
self.link.add_pageproblem('link contains unescaped spaces: ' + url + ', ' + self._location())
# replace spaces by %20
url=_spacepattern.sub('%20',url)
# replace nnn; entity refs with proper characters
for charEntity in _charentitypattern.findall(url):
url = url.replace(charEntity,chr(int(charEntity[2:-1])))
# url encode strange characters (reserved chars are unharmed)
url = urllib.quote(url, ';/?:@&=+$,%')
return url
def error(self, message):
"""Override superclass' error() method to ignore errors."""
# construct error message
message += ', ' + self._location()
# store error message
debugio.debug("parsers.html._MyHTMLParser.error(): problem parsing html: "+message)
if self.errmsg is None:
self.errmsg = message
# increment error count
self.errcount += 1
if self.errcount > 10:
raise HTMLParser.HTMLParseError(message, self.getpos())
def check_for_whole_start_tag(self, i):
"""Override to catch assertion exception."""
try:
return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
except AssertionError, e:
debugio.debug("parsers.html._MyHTMLParser.check_for_whole_start_tag(): caugt assertion error")
def handle_starttag(self, tag, attrs):
"""Handle start tags in html."""
# turn attrs into hash
attrs=dict(attrs)
#
content
if tag == "title":
self.collect = ""
#
elif tag == "base" and attrs.has_key("href"):
self.base = self._cleanurl(attrs["href"])
#
elif tag == "link" and attrs.has_key("rel") and attrs.has_key("href"):
if attrs["rel"].lower() in ("stylesheet", "alternate stylesheet", "icon", "shortcut icon"):
self.embedded.append(self._cleanurl(attrs["href"]))
#
elif tag == "meta" and attrs.has_key("name") and attrs.has_key("content") and attrs["name"].lower() == "author":
if self.author is None:
self.author = attrs["content"]
#
elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "refresh":
pass # TODO: implement
#
elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "content-type":
if self.link.encoding is None:
try:
self.link.encoding = _charsetpattern.search(attrs["content"]).group(1)
except AttributeError:
pass
#
elif tag == "img" and attrs.has_key("src"):
self.embedded.append(self._cleanurl(attrs["src"]))
#
elif tag == "a" and attrs.has_key("href"):
self.children.append(self._cleanurl(attrs["href"]))
#
elif tag == "frame" and attrs.has_key("src"):
self.embedded.append(self._cleanurl(attrs["src"]))
#
elif tag == "area" and attrs.has_key("href"):
self.children.append(self._cleanurl(attrs["href"]))
#