# html.py - parser functions for html content
#
# Copyright (C) 2005 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""Parser functions for processing HTML content."""
import config
import debugio
import HTMLParser
import urlparse
import re
# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
# pattern for matching numeric html entities
_charentitypattern = re.compile('[0-9]{1,3};')
class _MyHTMLParser(HTMLParser.HTMLParser):
"""A simple subclass of HTMLParser.HTMLParser continuing after errors
and gathering some information from the parsed content."""
def __init__(self):
"""Inialize the menbers in which we collect data from parsing the
document."""
self.collect = None
self.base = None
self.title = None
self.author = None
self.embedded = []
self.children = []
self.errmsg = None
self.errcount = 0
HTMLParser.HTMLParser.__init__(self)
def error(self, message):
    """Override the superclass' error() method so that parsing continues
    after a problem instead of aborting right away.

    The message is extended with the current parser position and logged
    through debugio.debug(). Only the first message is kept in
    self.errmsg, while self.errcount is incremented on every call. Once
    more than 10 errors have been counted HTMLParser.HTMLParseError is
    raised with the extended message and the current position.
    """
    # append the position information to the message
    line, column = self.getpos()
    if line is not None:
        message = message + ", at line %d" % line
    if column is not None:
        # the offset is zero-based, report a one-based column
        message = message + ", column %d" % (column + 1)
    debugio.debug("parsers.html._MyHTMLParser.error(): problem parsing html: " + message)
    # only the first problem is remembered
    if self.errmsg is None:
        self.errmsg = message
    # keep track of the number of problems, tolerating at most 10
    self.errcount = self.errcount + 1
    if self.errcount <= 10:
        return
    raise HTMLParser.HTMLParseError(message, self.getpos())
def check_for_whole_start_tag(self, i):
    """Override to catch the AssertionError the superclass may raise.

    i is passed unchanged to
    HTMLParser.HTMLParser.check_for_whole_start_tag() and the value that
    call returns is returned as is. If the call raises AssertionError the
    problem is logged through debugio.debug() and -1 is returned.
    """
    try:
        return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
    except AssertionError:
        debugio.debug("parsers.html._MyHTMLParser.check_for_whole_start_tag(): caugt assertion error")
        # Return the negative integer the superclass itself uses for "no
        # complete start tag found" instead of falling through with None:
        # the superclass compares this result with "< 0", which only
        # worked for None through Python 2's arbitrary ordering of None
        # below all numbers.
        return -1
def handle_starttag(self, tag, attrs):
"""Handle start tags in html."""
# turn attrs into hash
attrs=dict(attrs)
#