code improvements thanks to pylint

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@242 86f53f14-5ff3-0310-afe5-9b438ce3f40c
author: Arthur de Jong <arthur@arthurdejong.org> 2006-04-23 13:31:29 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2006-04-23 13:31:29 +0200
commit: 83437235d294edc6db53aa10cd673594296f5f54 (patch)
tree: 9a5eb0ce81329433f09a1b61c66f675622c0b21e /parsers
parent: 1e2b11a0e49fcd563919aa7741600bef0b55a8e4 (diff)
3 files changed, 74 insertions, 65 deletions
diff --git a/parsers/__init__.py b/parsers/__init__.py
index 40cbbd5..9fe95dc 100644
--- a/parsers/__init__.py
+++ b/parsers/__init__.py
@@ -1,8 +1,8 @@
 
 # __init__.py - general content-type parser interface
 #
-# Copyright (C) 2005 Arthur de Jong
-# 
+# Copyright (C) 2005, 2006 Arthur de Jong
+#
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
@@ -25,8 +25,8 @@
 A content-type module can be requested by the get_parsemodule() function.
 Each module should export the following function:
 
-    parse(content,link)
-        Based on the content fill in the common fields of the link object."""
+    parse(content, link)
+        Based on the content, fill in the common fields of the link object."""
 
 # the modules that should be imported
 _modules = ('html', 'css')
@@ -36,11 +36,12 @@ _parsermodules = {}
 
 def _init_modules():
     """Initialize the modules."""
-    # go throught all known modules to probe the content-types (do this only once)
-    for m in _modules:
-        p = __import__('parsers.'+m,globals(),locals(),[m])
-        for t in p.mimetypes:
-            _parsermodules[t] = p
+    # go through all known modules to probe the content-types
+    # (do this only once)
+    for mod in _modules:
+        parser = __import__('parsers.'+mod, globals(), locals(), [mod])
+        for mimetype in parser.mimetypes:
+            _parsermodules[mimetype] = parser
 
 def get_parsermodule(mimetype):
     """Look up the correct module for the specified mimetype."""
@@ -52,6 +53,8 @@ def get_parsermodule(mimetype):
     return None
 
 def get_mimetypes():
+    """Return a list of supported mime types that can be parsed
+    by the installed parsers."""
     if _parsermodules == {}:
         _init_modules()
     return _parsermodules.keys()
diff --git a/parsers/css.py b/parsers/css.py
index a7f7573..3f707cc 100644
--- a/parsers/css.py
+++ b/parsers/css.py
@@ -1,7 +1,7 @@
 
 # css.py - parser functions for css content
 #
-# Copyright (C) 2005 Arthur de Jong
+# Copyright (C) 2005, 2006 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -20,6 +20,10 @@
 # The files produced as output from the software do not automatically fall
 # under the copyright of the software, unless explicitly stated otherwise.
 
+"""This module attempts to parse CSS files.
+It currently looks for url() links in stylesheet contents and also
+looks for @import processing directives."""
+
 mimetypes = ('text/css',)
 
 import urlparse
@@ -29,7 +33,8 @@ import re
 _commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE|re.DOTALL)
 
 # pattern for matching @import "url" statments in css
-_importpattern = re.compile('@import\s+["\']([^"\']*)["\']',re.IGNORECASE|re.DOTALL)
+_importpattern = re.compile('@import\s+["\']([^"\']*)["\']',
+                            re.IGNORECASE|re.DOTALL)
 
 # pattern for matching url(...) in css
 _urlpattern = re.compile('url\(["\']?(.*?)["\']?\)')
diff --git a/parsers/html.py b/parsers/html.py
index 73240f8..b7fe892 100644
--- a/parsers/html.py
+++ b/parsers/html.py
@@ -22,11 +22,9 @@
 
 """Parser functions for processing HTML content."""
 
-import config
 import debugio
 import HTMLParser
 import urlparse
-import urllib
 import re
 import crawler
 
@@ -34,13 +32,13 @@ import crawler
 mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
 
 # pattern for matching numeric html entities
-_charentitypattern = re.compile('&#[0-9]{1,3};')
+_charentitypattern = re.compile('&#([0-9]{1,3});')
 
 # pattern for matching all html entities
 _entitypattern = re.compile('&[^ ;]+;')
 
 # pattern for matching spaces
-_spacepattern = re.compile(" ")
+_spacepattern = re.compile(' ')
 
 # pattern for matching charset declaration for http-equiv tag
 _charsetpattern = re.compile('charset=([^ ]*)', re.I)
@@ -78,14 +76,15 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
             msg += ', column %d' % (offset + 1)
         return msg
 
-    def _cleanurl(self, url, what="link"):
+    def _cleanurl(self, url, what='link'):
         """Do some translations of url."""
-        # check for spaces in urls (characters are escaped in crawler._urlclean())
+        # check for spaces in urls
+        # (characters are escaped in crawler.urlescape())
         if _spacepattern.search(url):
-            self.link.add_pageproblem(what + ' contains unescaped spaces: ' + url + ', ' + self._location())
+            self.link.add_pageproblem(
+              what+' contains unescaped spaces: '+url+', '+self._location() )
         # replace &#nnn; entity refs with proper characters
-        for charEntity in _charentitypattern.findall(url):
-            url = url.replace(charEntity,chr(int(charEntity[2:-1])))
+        url = _charentitypattern.sub(lambda x:chr(int(x.group(1))), url)
         return crawler.urlescape(url)
 
     def error(self, message):
@@ -93,7 +92,7 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         # construct error message
         message += ', ' + self._location()
         # store error message
-        debugio.debug("parsers.html._MyHTMLParser.error(): problem parsing html: "+message)
+        debugio.debug('parsers.html._MyHTMLParser.error(): problem parsing html: '+message)
         if self.errmsg is None:
             self.errmsg = message
         # increment error count
@@ -105,45 +104,46 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         """Override to catch assertion exception."""
         try:
             return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i)
-        except AssertionError, e:
-            debugio.debug("parsers.html._MyHTMLParser.check_for_whole_start_tag(): caught assertion error")
+        except AssertionError:
+            debugio.debug('parsers.html._MyHTMLParser.check_for_whole_start_tag(): caught assertion error')
 
     def handle_starttag(self, tag, attrs):
         """Handle start tags in html."""
         # turn attrs into hash
-        attrs=dict(attrs)
-        # <title>content</title>
-        if tag == "title":
-            self.collect = ""
-        # <base href="url">
-        elif tag == "base" and attrs.has_key("href"):
-            self.base = self._cleanurl(attrs["href"])
-        # <link rel="type" href="url">
-        elif tag == "link" and attrs.has_key("rel") and attrs.has_key("href"):
-            if attrs["rel"].lower() in ("stylesheet", "alternate stylesheet", "icon", "shortcut icon"):
-                self.embedded.append(self._cleanurl(attrs["href"]))
-        # <meta name="author" content="Arthur de Jong">
-        elif tag == "meta" and attrs.has_key("name") and attrs.has_key("content") and attrs["name"].lower() == "author":
+        attrs = dict(attrs)
+        # <title>TITLE</title>
+        if tag == 'title':
+            self.collect = ''
+        # <base href="URL">
+        elif tag == 'base' and attrs.has_key('href'):
+            self.base = self._cleanurl(attrs['href'])
+        # <link rel="type" href="URL">
+        elif tag == 'link' and attrs.has_key('rel') and attrs.has_key('href'):
+            if attrs['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon', 'shortcut icon'):
+                self.embedded.append(self._cleanurl(attrs['href']))
+        # <meta name="author" content="AUTHOR">
+        elif tag == 'meta' and attrs.has_key('name') and attrs.has_key('content') and attrs['name'].lower() == 'author':
             if self.author is None:
-                self.author = attrs["content"]
-        # <meta http-equiv="refresh" content="0;url=http://ch.tudelft.nl/~arthur/">
-        elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "refresh":
+                self.author = attrs['content']
+        # <meta http-equiv="refresh" content="0;url=URL">
+        elif tag == 'meta' and attrs.has_key('http-equiv') and attrs.has_key('content') and attrs['http-equiv'].lower() == 'refresh':
             pass # TODO: implement
         # <meta http-equiv="content-type" content="text/html; charset=utf-8" />
-        elif tag == "meta" and attrs.has_key("http-equiv") and attrs.has_key("content") and attrs["http-equiv"].lower() == "content-type":
+        elif tag == 'meta' and attrs.has_key('http-equiv') and attrs.has_key('content') and attrs['http-equiv'].lower() == 'content-type':
             if self.link.encoding is None:
                 try:
-                    self.link.encoding = _charsetpattern.search(attrs["content"]).group(1)
+                    self.link.encoding = _charsetpattern.search(attrs['content']).group(1)
                 except AttributeError:
+                    # ignore cases where encoding is not set in header
                     pass
         # <img src="url">
-        elif tag == "img" and attrs.has_key("src"):
-            self.embedded.append(self._cleanurl(attrs["src"]))
+        elif tag == 'img' and attrs.has_key('src'):
+            self.embedded.append(self._cleanurl(attrs['src']))
         # <a href="url">
-        elif tag == "a" and attrs.has_key("href"):
-            self.children.append(self._cleanurl(attrs["href"]))
+        elif tag == 'a' and attrs.has_key('href'):
+            self.children.append(self._cleanurl(attrs['href']))
         # <a name="#anchor">
-        elif tag == "a" and attrs.has_key("name"):
+        elif tag == 'a' and attrs.has_key('name'):
             anchor = self._cleanurl(attrs['name'],'anchor')
             if anchor in self.anchors:
                 self.link.add_pageproblem(
@@ -153,24 +153,24 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
             else:
                 self.anchors.append(anchor)
         # <frameset><frame src="url"...>...</frameset>
-        elif tag == "frame" and attrs.has_key("src"):
-            self.embedded.append(self._cleanurl(attrs["src"]))
+        elif tag == 'frame' and attrs.has_key('src'):
+            self.embedded.append(self._cleanurl(attrs['src']))
         # <map><area href="url"...>...</map>
-        elif tag == "area" and attrs.has_key("href"):
-            self.children.append(self._cleanurl(attrs["href"]))
+        elif tag == 'area' and attrs.has_key('href'):
+            self.children.append(self._cleanurl(attrs['href']))
         # <applet code="url"...>
-        elif tag == "applet" and attrs.has_key("code"):
-            self.embedded.append(self._cleanurl(attrs["code"]))
+        elif tag == 'applet' and attrs.has_key('code'):
+            self.embedded.append(self._cleanurl(attrs['code']))
         # <embed src="url"...>
-        elif tag == "embed" and attrs.has_key("src"):
-            self.embedded.append(self._cleanurl(attrs["src"]))
+        elif tag == 'embed' and attrs.has_key('src'):
+            self.embedded.append(self._cleanurl(attrs['src']))
         # <embed><param name="movie" value="url"></embed>
-        elif tag == "param" and attrs.has_key("name") and attrs.has_key("value"):
-            if attrs["name"].lower() == "movie":
-                self.embedded.append(self._cleanurl(attrs["value"]))
+        elif tag == 'param' and attrs.has_key('name') and attrs.has_key('value'):
+            if attrs['name'].lower() == 'movie':
+                self.embedded.append(self._cleanurl(attrs['value']))
         # <style>content</style>
-        elif tag == "style":
-            self.collect = ""
+        elif tag == 'style':
+            self.collect = ''
 
     def handle_endtag(self, tag):
         """Handle end tags in html."""
@@ -221,12 +221,13 @@ def _maketxt(txt, encoding):
         # fall back to locale's encoding
         txt = unicode(txt, errors='replace')
     # replace &#nnn; entity refs with proper characters
-    for charEntity in _charentitypattern.findall(txt):
-        txt = txt.replace(charEntity, unichr(int(charEntity[2:-1])))
+    txt = _charentitypattern.sub(lambda x:chr(int(x.group(1))), txt)
     # replace html entity refs with proper characters
     for entity in _entitypattern.findall(txt):
         if (htmlentitydefs.name2codepoint.has_key(entity[1:-1])):
-            txt = txt.replace(entity, unichr(htmlentitydefs.name2codepoint[entity[1:-1]]))
+            txt = txt.replace(
+                    entity,
+                    unichr(htmlentitydefs.name2codepoint[entity[1:-1]]) )
     return txt
 
 def parse(content, link):
@@ -239,13 +240,13 @@ def parse(content, link):
         parser.close()
     except Exception, e:
         # ignore (but log) all errors
-        debugio.debug("parsers.html.parse(): caught exception: "+str(e))
+        debugio.debug('parsers.html.parse(): caught exception: '+str(e))
     # check for parser errors
     if parser.errmsg is not None:
-        debugio.debug("parsers.html.parse(): problem parsing html: "+parser.errmsg)
+        debugio.debug('parsers.html.parse(): problem parsing html: '+parser.errmsg)
         link.add_pageproblem('problem parsing html: %s' % parser.errmsg)
     # dump encoding
-    debugio.debug("parsers.html.parse(): html encoding: %s" % str(link.encoding))
+    debugio.debug('parsers.html.parse(): html encoding: %s' % str(link.encoding))
     # flag that the link contains a valid page
     link.ispage = True
     # save the title
author	Arthur de Jong <arthur@arthurdejong.org>	2006-04-23 13:31:29 +0200
committer	Arthur de Jong <arthur@arthurdejong.org>	2006-04-23 13:31:29 +0200
commit	83437235d294edc6db53aa10cd673594296f5f54 (patch)
tree	9a5eb0ce81329433f09a1b61c66f675622c0b21e /parsers
parent	1e2b11a0e49fcd563919aa7741600bef0b55a8e4 (diff)