# html.py - parser functions for html content
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
# Copyright (C) 2005 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""Parser functions for processing HTML content."""
import htmllib
import string
import debugio
import formatter
import sgmllib
import urlparse
mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
# TODO: switch to using HTMLParser (not from htmllib)
class _MyHTMLParser(htmllib.HTMLParser):
def __init__(self,formatter):
self.imagelist = []
self.title = None
self.author = None
self.base = None
htmllib.HTMLParser.__init__(self,formatter)
# override handle_image()
def handle_image(self,src,alt,*stuff):
if src not in self.imagelist:
self.imagelist.append(src)
def do_frame(self,attrs):
for name, val in attrs:
if name=="src":
self.anchorlist.append(val)
def save_bgn(self):
self.savedata = ''
def save_end(self):
data = self.savedata
self.savedata = None
return data
def start_title(self, attrs):
self.save_bgn()
def end_title(self):
#if not self.savedata:
# self.title = None
# return
if self.savedata and not self.title:
self.title = string.join(string.split(self.save_end()))
def do_meta(self,attrs):
fields={}
for name, value in attrs:
fields[name]=value
if fields.has_key('name'):
if string.lower(fields['name']) == 'author':
if fields.has_key('content'):
self.author = fields['content']
# stylesheet links
def do_link(self,attrs):
for name, val in attrs:
if name=="href":
if val not in self.anchorlist:
self.anchorlist.append(val)
# for client-side image maps
def do_area(self,attrs):
for name, val in attrs:
if name=="href":
if val not in self.anchorlist:
self.anchorlist.append(val)
def do_base(self,attrs):
for name,val in attrs:
if name=="href":
self.base = val
def parse(content, link):
"""Parse the specified content and extract an url list, a list of images a
title and an author. The content is assumed to contain HMTL."""
# parse the file
parser = _MyHTMLParser(formatter.NullFormatter())
try:
parser.feed(content)
parser.close()
except sgmllib.SGMLParseError, e:
debugio.warn('problem parsing html: %s' % (str(e)))
link.add_problem('problem parsing html: %s' % (str(e)))
# flag that the link contains a valid page
link.ispage = True
# figure out a base url to use for creating absolute urls
base = link.url
if parser.base is not None:
base = parser.base
# save the title
if parser.title is not None:
link.title = parser.title
# save the author
if parser.author is not None:
link.author = parser.author
# if the link is external we are not interested in the rest
if not link.isinternal:
return
# save the list of children (make links absolute)
for anchor in parser.anchorlist:
link.add_child(urlparse.urljoin(base,anchor))
# save list of images (make links absolute)
for image in parser.imagelist:
# create absolute url based on tag
link.add_embed(urlparse.urljoin(base,image))