content

# html.py - parser functions for html content # # Copyright (C) 2005 Arthur de Jong # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA """Parser functions for processing HTML content.""" import config import debugio import HTMLParser import urlparse import re # the list of mimetypes this module should be able to handle mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html') class _MyHTMLParser(HTMLParser.HTMLParser): """A simple subclass of HTMLParser.HTMLParser continuing after errors and gathering some information from the parsed content.""" def __init__(self): """Inialize the menbers in which we collect data from parsing the document.""" self.collect = None self.base = None self.title = None self.author = None self.embedded = [] self.children = [] self.errmsg = None self.errcount = 0 HTMLParser.HTMLParser.__init__(self) def error(self, message): """Override superclass' error() method to ignore errors.""" # construct error message (lineno, offset) = self.getpos() if lineno is not None: message += ", at line %d" % lineno if offset is not None: message += ", column %d" % (offset + 1) # store error message debugio.debug("parsers.html._MyHTMLParser.error(): problem parsing html: "+message) if self.errmsg is None: self.errmsg = message # increment error count self.errcount += 1 if self.errcount > 10: raise HTMLParser.HTMLParseError(message, self.getpos()) def check_for_whole_start_tag(self, i): """Override to catch assertion exception.""" try: return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i) except AssertionError, e: debugio.debug("parsers.html._MyHTMLParser.check_for_whole_start_tag(): caugt assertion error") def handle_starttag(self, tag, attrs): """Handle start tags in html.""" # turn attrs into hash attrs=dict(attrs) # content if tag == "title": self.collect = "" # elif tag == "base" and attrs.has_key("href"): self.base = attrs["href"] # elif tag == "link" and attrs.has_key("rel") and attrs.has_key("href"): if attrs["rel"].lower() in ("stylesheet", "alternate stylesheet", "icon", "shortcut icon"): self.embedded.append(attrs["href"]) # elif tag == "meta" and attrs.has_key("name") and attrs.has_key("content") and attrs["name"].lower() == "author": if self.author is None: self.author = attrs["content"] # elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "refresh": pass # TODO: implement #

elif tag == "img" and attrs.has_key("src"): self.embedded.append(attrs["src"]) # elif tag == "a" and attrs.has_key("href"): self.children.append(attrs["href"]) # ... elif tag == "frame" and attrs.has_key("src"): self.embedded.append(attrs["src"]) # ... elif tag == "area" and attrs.has_key("href"): self.children.append(attrs["href"]) # elif tag == "applet" and attrs.has_key("code"): self.embedded.append(attrs["code"]) #

elif tag == "embed" and attrs.has_key("src"): self.embedded.append(attrs["src"]) #

elif tag == "param" and attrs.has_key("name") and attrs.has_key("value"): if attrs["name"].lower() == "movie": self.embedded.append(attrs["value"]) def handle_endtag(self, tag): """Handle end tags in html.""" if tag == 'title' and self.title is None: self.title = self.collect self.collect = None def handle_data(self, data): """Collect data if we were collecting data.""" if self.collect is not None: self.collect += data def _cleanurl(url): """Do some translations of url.""" # replace &#nnn; entity refs with proper characters charEntityPattern = re.compile('&#[0-9]{1,3};') for charEntity in charEntityPattern.findall(url): url = url.replace(charEntity,chr(int(charEntity[2:-1]))) return url def parse(content, link): """Parse the specified content and extract an url list, a list of images a title and an author. The content is assumed to contain HMTL.""" # create parser and feed it the content parser = _MyHTMLParser() try: parser.feed(content) parser.close() except (HTMLParser.HTMLParseError, AttributeError): pass # check for parser errors if parser.errmsg is not None: debugio.debug("parsers.html.parse(): problem parsing html: "+parser.errmsg) link.add_problem('problem parsing html: %s' % parser.errmsg) pass # flag that the link contains a valid page link.ispage = True # save the title if parser.title is not None: link.title = parser.title # save the author if parser.author is not None: link.author = parser.author # figure out the base of the document (for building the other urls) base = link.url if parser.base is not None: base = parser.base # list embedded and children for embed in parser.embedded: link.add_embed(urlparse.urljoin(base,_cleanurl(embed))) for child in parser.children: link.add_child(urlparse.urljoin(base,_cleanurl(child)))