# html.py - parser functions for html content
#
# Copyright (C) 2005 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

"""Parser functions for processing HTML content."""

import config
import debugio
import HTMLParser
import urlparse
import re

# the list of mimetypes this module should be able to handle
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')

# pattern for matching numeric html entities (&#nnn;); _cleanurl() strips
# the two-character "&#" prefix and the trailing ";" to get at the number
_charentitypattern = re.compile('&#[0-9]{1,3};')

# pattern for matching spaces
_spacepattern = re.compile(" ")

class _MyHTMLParser(HTMLParser.HTMLParser):
    """A simple subclass of HTMLParser.HTMLParser continuing after errors
    and gathering some information from the parsed content."""

    def __init__(self, link):
        """Initialize the members in which we collect data from parsing the
        document."""
        # object that page problems are reported to (add_pageproblem())
        self.link = link
        self.collect = None
        self.base = None
        self.title = None
        self.author = None
        self.embedded = []
        self.children = []
        # first error message passed to error(), or None if there was none
        self.errmsg = None
        # number of times error() has been called
        self.errcount = 0
        HTMLParser.HTMLParser.__init__(self)

    def _location(self):
        """Return the current parser location as a string."""
        (lineno, offset) = self.getpos()
        if lineno is not None:
            msg = 'at line %d' % lineno
        else:
            msg = 'at unknown line'
        if offset is not None:
            # getpos() offsets are zero-based, report a one-based column
            msg += ', column %d' % (offset + 1)
        return msg

    def _cleanurl(self, url):
        """Do some translations of url.

        Unescaped spaces are reported as a page problem on self.link and
        replaced by %20. Numeric character references (&#nnn;) are replaced
        by the character they denote. The cleaned url is returned."""
        # check for spaces in urls
        if _spacepattern.search(url):
            self.link.add_pageproblem(
                'link contains unescaped spaces: ' + url + ', ' +
                self._location())
            # replace spaces by %20
            url = _spacepattern.sub('%20', url)
        # replace &#nnn; entity refs with proper characters
        for charEntity in _charentitypattern.findall(url):
            try:
                char = chr(int(charEntity[2:-1]))
            except ValueError:
                # the number is outside the range chr() accepts; leave the
                # reference as it is instead of aborting the whole parse
                continue
            url = url.replace(charEntity, char)
        return url

    def error(self, message):
        """Override superclass' error() method to ignore errors.

        The first message is remembered in self.errmsg and every call is
        counted in self.errcount. Once more than 10 errors have been seen
        an HTMLParser.HTMLParseError is raised after all."""
        # construct error message
        message += ', ' + self._location()
        # store error message
        debugio.debug("parsers.html._MyHTMLParser.error(): problem parsing html: "+message)
        if self.errmsg is None:
            self.errmsg = message
        # increment error count
        self.errcount += 1
        if self.errcount > 10:
            raise HTMLParser.HTMLParseError(message, self.getpos())

    def check_for_whole_start_tag(self, i):
        """Override to catch assertion exception.

        Returns whatever the superclass returns, or -1 if the superclass
        raised an AssertionError."""
        try:
            return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
        except AssertionError:
            debugio.debug("parsers.html._MyHTMLParser.check_for_whole_start_tag(): caugt assertion error")
            # report "no complete start tag" explicitly; falling through
            # returned None, which callers testing "< 0" only treated the
            # same way because Python 2 orders None below all integers
            return -1

    def handle_starttag(self, tag, attrs):
        """Handle start tags in html."""
        # turn attrs into hash
        attrs=dict(attrs)
        #