# htmlparse.py - html parsing functions
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike Meyer
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""Utilities for parsing HTML and URLs"""
import htmllib
import string
import debugio
from urlparse import urlparse, urljoin, urlunparse
from formatter import NullFormatter
def urlformat(url, parent=None):
    """Return a normalized version of url.

    If url carries no scheme and parent is given, url is expanded
    relative to parent with urljoin().  The fragment identifier
    ('#...') is then removed; the query string is left untouched.
    No trailing '/' is appended by this function.

    url    -- absolute or relative URL string
    parent -- optional base URL used to expand a scheme-less url
    """
    scheme = urlparse(url)[0]
    # Only scheme-less (relative) urls are expanded against parent.
    if scheme == '' and parent is not None:
        url = urljoin(parent, url)
    parts = tuple(urlparse(url))
    # Element 5 of the parsed 6-tuple is the fragment; blank it out and
    # reassemble the remaining components unchanged.
    return urlunparse(parts[:5] + ('',))
class MyHTMLParser(htmllib.HTMLParser):
    """HTML parser that records the links, images and metadata of a page.

    After a document has been fed, the results are available as attributes:

    anchorlist -- link targets; this class appends the src of <frame>
                  tags and the href of <link> and <area> tags to the
                  list maintained by htmllib.HTMLParser
    imagelist  -- distinct image sources in order of first appearance
    title      -- whitespace-normalized text of <title>, or None
    author     -- content of a <meta name="author"> tag, or None
    base       -- href of the <base> tag, or None
    """

    def __init__(self, formatter):
        """Set up the result attributes and initialize the base parser.

        formatter -- formatter object handed on to htmllib.HTMLParser
        """
        self.imagelist = []
        self.title = None
        self.author = None
        self.base = None
        htmllib.HTMLParser.__init__(self, formatter)

    # override handle_image()
    def handle_image(self, src, alt, *stuff):
        """Remember src as an image of the page (each source only once)."""
        if src not in self.imagelist:
            self.imagelist.append(src)

    def do_frame(self, attrs):
        """Treat the src of a <frame> as a link.

        Every src value is appended as-is; unlike do_link()/do_area()
        no duplicate check is made here.
        """
        for name, val in attrs:
            if name == "src":
                self.anchorlist.append(val)

    def save_bgn(self):
        """Start capturing: reset the savedata buffer to an empty string."""
        self.savedata = ''

    def save_end(self):
        """Stop capturing and return the buffer (None if none was active)."""
        data = self.savedata
        self.savedata = None
        return data

    def start_title(self, attrs):
        """Begin capturing the text of the <title> element."""
        self.save_bgn()

    def end_title(self):
        """Store the captured title with runs of whitespace collapsed."""
        text = self.save_end()
        if text is None:
            # A stray </title> without a matching <title>: nothing was
            # captured, so keep whatever title is already recorded
            # instead of failing on None.
            return
        self.title = ' '.join(text.split())

    def do_meta(self, attrs):
        """Record the author from <meta name="author" content="...">."""
        fields = dict(attrs)
        # The value of the name attribute is compared case-insensitively.
        if fields.get('name', '').lower() != 'author':
            return
        if 'content' in fields:
            author = fields['content']
            self.author = author
            debugio.info(' author: ' + author)

    def _add_unique_hrefs(self, attrs):
        """Append each href in attrs to anchorlist unless already there."""
        for name, val in attrs:
            if name == "href" and val not in self.anchorlist:
                self.anchorlist.append(val)

    # stylesheet links
    def do_link(self, attrs):
        """Treat the href of a <link> tag as a link of the page."""
        self._add_unique_hrefs(attrs)

    # for client-side image maps
    def do_area(self, attrs):
        """Treat the href of an <area> tag as a link of the page."""
        self._add_unique_hrefs(attrs)

    def do_base(self, attrs):
        """Remember the href of the <base> tag in self.base."""
        for name, val in attrs:
            if name == "href":
                self.base = val
def _unique_urls(rawurls, parent):
    """Return rawurls passed through urlformat(), duplicates dropped, order kept."""
    seen = set()
    result = []
    for raw in rawurls:
        formatted = urlformat(raw, parent)
        if formatted not in seen:
            seen.add(formatted)
            result.append(formatted)
    return result


def pageLinks(url, page):
    """Extract the links, images, title and author of an HTML page.

    url  -- URL of the page; partial urls found in the page are expanded
            relative to it unless the page contains a <base href="...">
            tag, in which case that href is used instead
    page -- the HTML document; it is passed unchanged to the parser's
            feed() method, so presumably this must be the page text
            rather than a file object (verify against callers)

    Returns a tuple (urllist, imagelist, title, author).  urllist and
    imagelist contain urlformat()-normalized urls without duplicates, in
    order of first appearance; title and author are taken from the parser
    and are None when the page does not provide them.
    """
    parser = MyHTMLParser(NullFormatter())
    parser.feed(page)
    parser.close()
    # A <base> tag in the page takes precedence over the page's own url.
    if parser.base is not None:
        parent = parser.base
    else:
        parent = url
    urllist = _unique_urls(parser.anchorlist, parent)
    imagelist = _unique_urls(parser.imagelist, parent)
    return (urllist, imagelist, parser.title, parser.author)