author | Arthur de Jong <arthur@arthurdejong.org> | 2011-09-16 15:36:38 +0200
committer | Arthur de Jong <arthur@arthurdejong.org> | 2011-09-16 15:36:38 +0200
commit | 7d7b8cb696c023e3917c9f15485c0d544de7bbe7 (patch)
tree | d98b495f721e200d7a7feb59fd8c57751a62d7f1
parent | e107eea095b856e37a407817214adc890265874c (diff)
move all the code except the command-line handling to the webcheck package and reorganise imports accordingly
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@435 86f53f14-5ff3-0310-afe5-9b438ce3f40c
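The practical effect of the reorganisation is that every module except cmd.py now lives inside the webcheck package, so flat top-level imports become package-qualified ones. A minimal sketch of the before/after import pattern, using module and class names taken from the diff below (illustrative only, not an excerpt from the commit; it assumes the reorganised webcheck package is on the Python path):

# old style: modules sat at the top level of the source tree
#   import config
#   import db
#   import plugins
# new style: import through the webcheck package
from webcheck import config, debugio
from webcheck.db import Session, Link
import webcheck.plugins

# callers then qualify names accordingly, for example:
session = Session()
links = session.query(Link).order_by(Link.url)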
-rwxr-xr-x | cmd.py (renamed from webcheck.py) | 28
-rw-r--r-- | webcheck/__init__.py | 0
-rw-r--r-- | webcheck/config.py (renamed from config.py) | 1
-rw-r--r-- | webcheck/crawler.py (renamed from crawler.py) | 71
-rw-r--r-- | webcheck/db.py (renamed from db.py) | 15
-rw-r--r-- | webcheck/debugio.py (renamed from debugio.py) | 1
-rw-r--r-- | webcheck/monkeypatch.py (renamed from monkeypatch.py) | 4
-rw-r--r-- | webcheck/myurllib.py (renamed from myurllib.py) | 2
-rw-r--r-- | webcheck/parsers/__init__.py (renamed from parsers/__init__.py) | 2
-rw-r--r-- | webcheck/parsers/css.py (renamed from parsers/css.py) | 3
-rw-r--r-- | webcheck/parsers/html/__init__.py (renamed from parsers/html/__init__.py) | 24
-rw-r--r-- | webcheck/parsers/html/beautifulsoup.py (renamed from parsers/html/beautifulsoup.py) | 53
-rw-r--r-- | webcheck/parsers/html/calltidy.py (renamed from parsers/html/calltidy.py) | 9
-rw-r--r-- | webcheck/parsers/html/htmlparser.py (renamed from parsers/html/htmlparser.py) | 34
-rw-r--r-- | webcheck/plugins/__init__.py (renamed from plugins/__init__.py) | 39
-rw-r--r-- | webcheck/plugins/about.py (renamed from plugins/about.py) | 28
-rw-r--r-- | webcheck/plugins/anchors.py (renamed from plugins/anchors.py) | 10
-rw-r--r-- | webcheck/plugins/badlinks.py (renamed from plugins/badlinks.py) | 24
-rw-r--r-- | webcheck/plugins/external.py (renamed from plugins/external.py) | 20
-rw-r--r-- | webcheck/plugins/images.py (renamed from plugins/images.py) | 22
-rw-r--r-- | webcheck/plugins/new.py (renamed from plugins/new.py) | 26
-rw-r--r-- | webcheck/plugins/notchkd.py (renamed from plugins/notchkd.py) | 20
-rw-r--r-- | webcheck/plugins/notitles.py (renamed from plugins/notitles.py) | 28
-rw-r--r-- | webcheck/plugins/old.py (renamed from plugins/old.py) | 26
-rw-r--r-- | webcheck/plugins/problems.py (renamed from plugins/problems.py) | 28
-rw-r--r-- | webcheck/plugins/sitemap.py (renamed from plugins/sitemap.py) | 22
-rw-r--r-- | webcheck/plugins/size.py (renamed from plugins/size.py) | 26
-rw-r--r-- | webcheck/plugins/urllist.py (renamed from plugins/urllist.py) | 14
28 files changed, 294 insertions, 286 deletions
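Besides the rewritten import statements, the diff below also updates several of the dynamic __import__ calls that load parser and plugin modules by name, so the dotted path gains the webcheck prefix (for example 'parsers.' + mod becomes 'webcheck.parsers.' + mod). A small sketch of that pattern under the new layout (Python 2, matching the codebase; illustrative only, assuming the webcheck package and a plugin named 'sitemap' are importable):

# load a report plugin by its short name, as the report code does after the move
name = 'sitemap'
plugin = __import__('webcheck.plugins.' + name, globals(), locals(), [name])
# each plugin module declares metadata such as __title__ and, optionally,
# generate()/postprocess() hooks
print plugin.__title__
if hasattr(plugin, 'generate'):
    pass  # plugin.generate(site) would render this plugin's report page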
@@ -1,6 +1,6 @@ #!/usr/bin/env python -# webcheck.py - main module of webcheck doing command-line checking +# cmd.py - command-line front-end for webcheck # # Copyright (C) 1998, 1999 Albert Hopkins (marduk) # Copyright (C) 2002 Mike W. Meyer @@ -28,22 +28,22 @@ __version__ = '1.10.4' __homepage__ = 'http://arthurdejong.org/webcheck/' -import sys import os import re -import urlparse +import sys import urllib +import urlparse -import config +from webcheck import config # update some fields that currently are stored in config config.VERSION = __version__ config.HOMEPAGE = __homepage__ -import crawler -import plugins -import debugio -import monkeypatch -import db +from webcheck import debugio +import webcheck.crawler +import webcheck.db +import webcheck.monkeypatch +import webcheck.plugins debugio.loglevel = debugio.INFO @@ -166,9 +166,9 @@ def parse_args(site): filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite') from sqlalchemy import create_engine engine = create_engine('sqlite:///' + filename) - db.Session.configure(bind=engine) + webcheck.db.Session.configure(bind=engine) # ensure that all tables are created - db.Base.metadata.create_all(engine) + webcheck.db.Base.metadata.create_all(engine) # TODO: schema migraton goes here # add configuration to site for pattern in internal_urls: @@ -235,7 +235,7 @@ def install_file(source, text=False): 'strerror': strerror}) sys.exit(1) # create file in output directory (with overwrite question) - tfp = plugins.open_file(os.path.basename(source)) + tfp = webcheck.plugins.open_file(os.path.basename(source)) # copy contents shutil.copyfileobj(sfp, tfp) # close files @@ -247,7 +247,7 @@ def main(site): """Main program.""" # crawl through the website debugio.info('checking site....') - crawler.setup_urllib2() + webcheck.crawler.setup_urllib2() site.crawl() # this will take a while debugio.info('done.') # do postprocessing (building site structure, etc) @@ -269,7 +269,7 @@ def main(site): if __name__ == '__main__': try: # initialize site object - site = crawler.Site() + site = webcheck.crawler.Site() # parse command-line arguments parse_args(site) # run the main program diff --git a/webcheck/__init__.py b/webcheck/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/webcheck/__init__.py diff --git a/config.py b/webcheck/config.py index e106b8e..3d66d42 100644 --- a/config.py +++ b/webcheck/config.py @@ -29,6 +29,7 @@ items should be changeble from the command line.""" import urllib + # Whether to consider any URL not starting with the base URL to be external. # This is the state of the -b command line option. 
BASE_URLS_ONLY = False diff --git a/crawler.py b/webcheck/crawler.py index 5c842db..3614d3f 100644 --- a/crawler.py +++ b/webcheck/crawler.py @@ -40,10 +40,11 @@ import urllib import urllib2 import urlparse -import config -import db -import debugio -import parsers +from webcheck.db import Session, Link, LinkProblem, PageProblem, children, \ + embedded +from webcheck import debugio +import webcheck.config +import webcheck.parsers class RedirectError(urllib2.HTTPError): @@ -61,7 +62,7 @@ class NoRedirectHandler(urllib2.HTTPRedirectHandler): def setup_urllib2(): """Configure the urllib2 module to store cookies in the output directory.""" - filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp') + filename = os.path.join(webcheck.config.OUTPUT_DIR, 'cookies.lwp') # set up our cookie jar cookiejar = cookielib.LWPCookieJar(filename) try: @@ -73,9 +74,9 @@ def setup_urllib2(): opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), NoRedirectHandler()) opener.addheaders = [ - ('User-agent', 'webcheck %s' % config.VERSION), + ('User-agent', 'webcheck %s' % webcheck.config.VERSION), ] - if config.BYPASSHTTPCACHE: + if webcheck.config.BYPASSHTTPCACHE: opener.addheaders.append(('Cache-control', 'no-cache')) opener.addheaders.append(('Pragma', 'no-cache')) urllib2.install_opener(opener) @@ -116,7 +117,7 @@ class Site(object): def add_internal(self, url): """Add the given url and consider all urls below it to be internal. These links are all marked for checking with the crawl() function.""" - url = db.Link.clean_url(url) + url = Link.clean_url(url) if url not in self._internal_urls: self._internal_urls.add(url) @@ -145,7 +146,7 @@ class Site(object): return True res = False # check that the url starts with an internal url - if config.BASE_URLS_ONLY: + if webcheck.config.BASE_URLS_ONLY: # the url must start with one of the _internal_urls for i in self._internal_urls: res |= (i == url[:len(i)]) @@ -201,10 +202,10 @@ class Site(object): return 'yanked' # check if we should avoid external links is_internal = self._is_internal(url) - if not is_internal and config.AVOID_EXTERNAL_LINKS: + if not is_internal and webcheck.config.AVOID_EXTERNAL_LINKS: return 'external avoided' # check if we should use robot parsers - if not config.USE_ROBOTS: + if not webcheck.config.USE_ROBOTS: return None (scheme, netloc) = urlparse.urlsplit(url)[0:2] # skip schemes not having robot.txt files @@ -223,16 +224,16 @@ class Site(object): def get_link(self, session, url): # try to find the URL - url = db.Link.clean_url(url) - link = session.query(db.Link).filter_by(url=url).first() + url = Link.clean_url(url) + link = session.query(Link).filter_by(url=url).first() if not link: - link = db.Link(url=url) + link = Link(url=url) session.add(link) return link def get_links_to_crawl(self, session): - links = session.query(db.Link).filter(db.Link.fetched == None) - return links.filter(db.Link.yanked == None) + links = session.query(Link).filter(Link.fetched == None) + return links.filter(Link.yanked == None) def crawl(self): """Crawl the website based on the urls specified with @@ -240,22 +241,22 @@ class Site(object): is specified the crawler writes out updated links to the file while crawling the site.""" # get a database session - session = db.Session() + session = Session() # remove all links - if not config.CONTINUE: - session.query(db.LinkProblem).delete() + if not webcheck.config.CONTINUE: + session.query(LinkProblem).delete() session.commit() - session.query(db.PageProblem).delete() + 
session.query(PageProblem).delete() session.commit() - session.execute(db.children.delete()) + session.execute(children.delete()) session.commit() - session.execute(db.embedded.delete()) + session.execute(embedded.delete()) session.commit() - session.query(db.Link).delete() + session.query(Link).delete() session.commit() # add all internal urls to the database for url in self._internal_urls: - url = db.Link.clean_url(url) + url = Link.clean_url(url) self.get_link(session, url) # add some URLs from the database that haven't been fetched tocheck = self.get_links_to_crawl(session) @@ -284,10 +285,10 @@ class Site(object): # flush database changes session.commit() # sleep between requests if configured - if config.WAIT_BETWEEN_REQUESTS > 0: + if webcheck.config.WAIT_BETWEEN_REQUESTS > 0: debugio.debug('crawler.crawl(): sleeping %s seconds' % - config.WAIT_BETWEEN_REQUESTS) - time.sleep(config.WAIT_BETWEEN_REQUESTS) + webcheck.config.WAIT_BETWEEN_REQUESTS) + time.sleep(webcheck.config.WAIT_BETWEEN_REQUESTS) debugio.debug('crawler.crawl(): items left to check: %d' % (remaining + len(tocheck))) session.commit() @@ -346,7 +347,7 @@ class Site(object): def parse(self, link, response): """Parse the fetched response.""" # find a parser for the content-type - parsermodule = parsers.get_parsermodule(link.mimetype) + parsermodule = webcheck.parsers.get_parsermodule(link.mimetype) if parsermodule is None: debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype) return @@ -368,7 +369,7 @@ class Site(object): """Do some basic post processing of the collected data, including depth calculation of every link.""" # get a database session - session = db.Session() + session = Session() # build the list of urls that were set up with add_internal() that # do not have a parent (they form the base for the site) for url in self._internal_urls: @@ -381,11 +382,11 @@ class Site(object): self.bases.append(link) # if we got no bases, just use the first internal one if not self.bases: - link = session.query(db.Link).filter(db.Link.is_internal == True).first() + link = session.query(Link).filter(Link.is_internal == True).first() debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url) self.bases.append(link) # do a breadth first traversal of the website to determine depth - session.query(db.Link).update(dict(depth=None), synchronize_session=False) + session.query(Link).update(dict(depth=None), synchronize_session=False) session.commit() depth = 0 count = len(self.bases) @@ -396,15 +397,15 @@ class Site(object): while count > 0: # update the depth of all links without a depth that have a # parent with the previous depth - qry = session.query(db.Link).filter(db.Link.depth == None) - qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth)) + qry = session.query(Link).filter(Link.depth == None) + qry = qry.filter(Link.linked_from.any(Link.depth == depth)) count = qry.update(dict(depth=depth + 1), synchronize_session=False) session.commit() depth += 1 debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth)) # TODO: also handle embeds # see if any of the plugins want to do postprocessing - for p in config.PLUGINS: + for p in webcheck.config.PLUGINS: # import the plugin plugin = __import__('plugins.' 
+ p, globals(), locals(), [p]) if hasattr(plugin, 'postprocess'): @@ -413,7 +414,7 @@ class Site(object): def generate(self): """Generate pages for plugins.""" - for p in config.PLUGINS: + for p in webcheck.config.PLUGINS: # import the plugin plugin = __import__('plugins.' + p, globals(), locals(), [p]) if hasattr(plugin, 'generate'): @@ -29,9 +29,9 @@ from sqlalchemy.orm import relationship, backref, sessionmaker from sqlalchemy.orm.session import object_session from sqlalchemy.sql.expression import ClauseElement, union -import config -import debugio -import myurllib +from webcheck.myurllib import normalizeurl +import webcheck.config +import webcheck.debugio # provide session and schema classes @@ -93,15 +93,14 @@ class Link(Base): @staticmethod def clean_url(url): # normalise the URL, removing the fragment from the URL - url = myurllib.normalizeurl(url) - return urlparse.urldefrag(myurllib.normalizeurl(url))[0] + return urlparse.urldefrag(normalizeurl(url))[0] def _get_link(self, url): """Get a link object for the specified URL.""" # get the session session = object_session(self) # normalise the URL, removing the fragment from the URL - url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url)) + url, fragment = urlparse.urldefrag(normalizeurl(url)) # try to find the link instance = session.query(Link).filter_by(url=url).first() if not instance: @@ -118,7 +117,7 @@ class Link(Base): the encoding is supported.""" if not self.encoding and encoding: try: - debugio.debug('crawler.Link.set_encoding(%r)' % encoding) + webcheck.debugio.debug('crawler.Link.set_encoding(%r)' % encoding) unicode('just some random text', encoding, 'replace') self.encoding = encoding except Exception, e: @@ -133,7 +132,7 @@ class Link(Base): self.redirectdepth = max([self.redirectdepth] + [x.redirectdepth for x in self.parents]) + 1 # check depth - if self.redirectdepth >= config.REDIRECT_DEPTH: + if self.redirectdepth >= webcheck.config.REDIRECT_DEPTH: self.add_linkproblem('too many redirects (%d)' % self.redirectdepth) return # check for redirect to self diff --git a/debugio.py b/webcheck/debugio.py index 6d7f698..3da73e3 100644 --- a/debugio.py +++ b/webcheck/debugio.py @@ -30,6 +30,7 @@ info(), warn() and error() whenever you want to print something.""" import sys + # log levels that can be used ERROR = 0 WARN = 1 diff --git a/monkeypatch.py b/webcheck/monkeypatch.py index cf9218e..a1af120 100644 --- a/monkeypatch.py +++ b/webcheck/monkeypatch.py @@ -21,9 +21,9 @@ # under the copyright of the software, unless explicitly stated otherwise. import re -import urlparse -import urllib import sys +import urllib +import urlparse __all__ = [] diff --git a/myurllib.py b/webcheck/myurllib.py index bd5987c..abe26b2 100644 --- a/myurllib.py +++ b/webcheck/myurllib.py @@ -20,9 +20,9 @@ # The files produced as output from the software do not automatically fall # under the copyright of the software, unless explicitly stated otherwise. -import urlparse import re import urllib +import urlparse # this is a workaround for Python 2.3 try: diff --git a/parsers/__init__.py b/webcheck/parsers/__init__.py index 3bfbd1f..f0f5f97 100644 --- a/parsers/__init__.py +++ b/webcheck/parsers/__init__.py @@ -40,7 +40,7 @@ def _init_modules(): # go throught all known modules to probe the content-types # (do this only once) for mod in _modules: - parser = __import__('parsers.' + mod, globals(), locals(), [mod]) + parser = __import__('webcheck.parsers.' 
+ mod, globals(), locals(), [mod]) for mimetype in parser.mimetypes: _parsermodules[mimetype] = parser diff --git a/parsers/css.py b/webcheck/parsers/css.py index 5ab2905..1b22a9d 100644 --- a/parsers/css.py +++ b/webcheck/parsers/css.py @@ -26,8 +26,9 @@ looks for @import processing directives.""" mimetypes = ('text/css',) -import urlparse import re +import urlparse + # pattern for matching /* ... */ comments in css _commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL) diff --git a/parsers/html/__init__.py b/webcheck/parsers/html/__init__.py index 09966f4..d4c8fe7 100644 --- a/parsers/html/__init__.py +++ b/webcheck/parsers/html/__init__.py @@ -24,10 +24,12 @@ module that tries to load the BeatifulSoup parser first and falls back to loading the legacy HTMLParser parser.""" -import debugio -import re import htmlentitydefs -import config +import re + +from webcheck import debugio +import webcheck.config + # the list of mimetypes this module should be able to handle mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html') @@ -93,15 +95,15 @@ def _parsefunction(content, link): global _parsefunction try: # try BeautifulSoup parser first - import parsers.html.beautifulsoup - debugio.debug('parsers.html.parse(): the BeautifulSoup parser is ok') - _parsefunction = parsers.html.beautifulsoup.parse + import webcheck.parsers.html.beautifulsoup + debugio.debug('webcheck.parsers.html.parse(): the BeautifulSoup parser is ok') + _parsefunction = webcheck.parsers.html.beautifulsoup.parse except ImportError: # fall back to legacy HTMLParser parser debugio.warn('falling back to the legacy HTML parser, ' 'consider installing BeautifulSoup') - import parsers.html.htmlparser - _parsefunction = parsers.html.htmlparser.parse + import webcheck.parsers.html.htmlparser + _parsefunction = webcheck.parsers.html.htmlparser.parse # call the actual parse function _parsefunction(content, link) @@ -112,12 +114,12 @@ def parse(content, link): # call the normal parse function _parsefunction(content, link) # call the tidy parse function - if config.TIDY_OPTIONS: + if webcheck.config.TIDY_OPTIONS: try: import calltidy - debugio.debug('parsers.html.parse(): the Tidy parser is ok') + debugio.debug('webcheck.parsers.html.parse(): the Tidy parser is ok') calltidy.parse(content, link) except ImportError: debugio.warn('tidy library (python-utidylib) is unavailable') # remove config to only try once - config.TIDY_OPTIONS = None + webcheck.config.TIDY_OPTIONS = None diff --git a/parsers/html/beautifulsoup.py b/webcheck/parsers/html/beautifulsoup.py index 268014d..0c71a5f 100644 --- a/parsers/html/beautifulsoup.py +++ b/webcheck/parsers/html/beautifulsoup.py @@ -24,13 +24,16 @@ BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser module.""" -import urlparse -import crawler -import re import htmlentitydefs +import re +import urlparse + import BeautifulSoup -import myurllib -from parsers.html import htmlunescape + +from webcheck.myurllib import normalizeurl +from webcheck.parsers.html import htmlunescape +import crawler + # pattern for matching http-equiv and content part of # <meta http-equiv="refresh" content="0;url=URL"> @@ -57,21 +60,21 @@ def parse(content, link): if title and title.string: link.title = htmlunescape(title.string).strip() - # FIXME: using myurllib.normalizeurl is wrong below, we should probably use + # FIXME: using normalizeurl is wrong below, we should probably use # something like link.urlunescape() to do the escaping and check # and log at the same time 
# <base href="URL"> base = soup.find('base', href=True) if base: - base = myurllib.normalizeurl(htmlunescape(base['href']).strip()) + base = normalizeurl(htmlunescape(base['href']).strip()) else: base = link.url # <link rel="TYPE" href="URL"> for l in soup.findAll('link', rel=True, href=True): if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon', 'shortcut icon'): - embed = myurllib.normalizeurl(htmlunescape(l['href']).strip()) + embed = normalizeurl(htmlunescape(l['href']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <meta name="author" content="AUTHOR"> @@ -91,26 +94,26 @@ def parse(content, link): link.add_child(urlparse.urljoin(base, child)) # <img src="URL"> for img in soup.findAll('img', src=True): - embed = myurllib.normalizeurl(htmlunescape(img['src']).strip()) + embed = normalizeurl(htmlunescape(img['src']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <a href="URL"> for a in soup.findAll('a', href=True): - child = myurllib.normalizeurl(htmlunescape(a['href']).strip()) + child = normalizeurl(htmlunescape(a['href']).strip()) if child: link.add_child(urlparse.urljoin(base, child)) # <a name="NAME"> # TODO: consistent url escaping? for a in soup.findAll('a', attrs={'name': True}): # get anchor name - a_name = myurllib.normalizeurl(htmlunescape(a['name']).strip()) + a_name = normalizeurl(htmlunescape(a['name']).strip()) # if both id and name are used they should be the same if 'id' in a and \ - a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()): + a_name != normalizeurl(htmlunescape(a['id']).strip()): link.add_pageproblem( 'anchors defined in name and id attributes do not match') # add the id anchor anyway - link.add_anchor(myurllib.normalizeurl(htmlunescape(a['id']).strip())) + link.add_anchor(normalizeurl(htmlunescape(a['id']).strip())) # add the anchor link.add_anchor(a_name) # <ANY id="ID"> @@ -119,51 +122,51 @@ def parse(content, link): if elem.name == 'a' and 'name' in elem: continue # add the anchor - link.add_anchor(myurllib.normalizeurl(htmlunescape(elem['id']).strip())) + link.add_anchor(normalizeurl(htmlunescape(elem['id']).strip())) # <frameset><frame src="URL"...>...</frameset> for frame in soup.findAll('frame', src=True): - embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip()) + embed = normalizeurl(htmlunescape(frame['src']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <iframe src="URL"...> for frame in soup.findAll('iframe', src=True): - embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip()) + embed = normalizeurl(htmlunescape(frame['src']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <object data="URL"...> for obj in soup.findAll('object', data=True): - embed = myurllib.normalizeurl(htmlunescape(obj['data']).strip()) + embed = normalizeurl(htmlunescape(obj['data']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <object><param name="movie" value="URL"...></object> for para in soup.findAll('param', attrs={'name': 'movie', 'value': True}): - embed = myurllib.normalizeurl(htmlunescape(para['value']).strip()) + embed = normalizeurl(htmlunescape(para['value']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <map><area href="URL"...>...</map> for area in soup.findAll('area', href=True): - child = myurllib.normalizeurl(htmlunescape(area['href']).strip()) + child = normalizeurl(htmlunescape(area['href']).strip()) if child: link.add_child(urlparse.urljoin(base, child)) # <applet code="URL" 
[archive="URL"]...> for applet in soup.findAll('applet', code=True): # if applet has archive tag check that if 'archive' in applet: - embed = myurllib.normalizeurl(htmlunescape(applet['archive']).strip()) + embed = normalizeurl(htmlunescape(applet['archive']).strip()) else: - embed = myurllib.normalizeurl(htmlunescape(applet['code']).strip()) + embed = normalizeurl(htmlunescape(applet['code']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <embed src="URL"...> for embedd in soup.findAll('frame', src=True): - embed = myurllib.normalizeurl(htmlunescape(embedd['src']).strip()) + embed = normalizeurl(htmlunescape(embedd['src']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <embed><param name="movie" value="url"></embed> for param in soup.findAll('param', attrs={ 'name': re.compile("^movie$", re.I), 'value': True}): - embed = myurllib.normalizeurl(htmlunescape(param['value']).strip()) + embed = normalizeurl(htmlunescape(param['value']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <style>content</style> @@ -179,12 +182,12 @@ def parse(content, link): parsers.css.parse(elem['style'], link, base) # <script src="url"> for script in soup.findAll('script', src=True): - embed = myurllib.normalizeurl(htmlunescape(script['src']).strip()) + embed = normalizeurl(htmlunescape(script['src']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # <body|table|td background="url"> for t in soup.findAll(('body', 'table', 'td'), background=True): - embed = myurllib.normalizeurl(htmlunescape(t['background']).strip()) + embed = normalizeurl(htmlunescape(t['background']).strip()) if embed: link.add_embed(urlparse.urljoin(base, embed)) # flag that the link contains a valid page diff --git a/parsers/html/calltidy.py b/webcheck/parsers/html/calltidy.py index 6623b08..505f185 100644 --- a/parsers/html/calltidy.py +++ b/webcheck/parsers/html/calltidy.py @@ -21,8 +21,9 @@ # under the copyright of the software, unless explicitly stated otherwise. import tidy -import config -import parsers.html + +import webcheck.config +import webcheck.parsers.html def parse(content, link): @@ -30,7 +31,7 @@ def parse(content, link): link.""" # only call tidy on internal pages if link.is_internal: - t = tidy.parseString(content, **config.TIDY_OPTIONS) + t = tidy.parseString(content, **webcheck.config.TIDY_OPTIONS) for err in t.errors: # error messages are escaped so we unescape them - link.add_pageproblem(parsers.html.htmlunescape(unicode(err))) + link.add_pageproblem(webcheck.parsers.html.htmlunescape(unicode(err))) diff --git a/parsers/html/htmlparser.py b/webcheck/parsers/html/htmlparser.py index 2bae745..fa82045 100644 --- a/parsers/html/htmlparser.py +++ b/webcheck/parsers/html/htmlparser.py @@ -25,13 +25,15 @@ the legacy HTMLParser module. It will only be used if BeatifulSoup is not available and can be considered depricated. 
This parser will only handle properly formatted HTML.""" -import debugio import HTMLParser -import urlparse import re -import crawler -import myurllib -from parsers.html import htmlunescape +import urlparse + +from webcheck import debugio +from webcheck.myurllib import normalizeurl +from webcheck.parsers.html import htmlunescape +import webcheck.crawler + # pattern for matching numeric html entities _charentitypattern = re.compile('&#([0-9]{1,3});') @@ -79,20 +81,20 @@ class _MyHTMLParser(HTMLParser.HTMLParser): def _cleanurl(self, url, what='link'): """Do some translations of url.""" # check for spaces in urls - # (characters are escaped in myurllib.normalizeurl()) + # (characters are escaped in normalizeurl()) if _spacepattern.search(url): self.link.add_pageproblem( what + ' contains unescaped spaces: ' + url + ', ' + self._location()) # replace &#nnn; entity refs with proper characters url = _charentitypattern.sub(lambda x: chr(int(x.group(1))), url) - return myurllib.normalizeurl(url) + return normalizeurl(url) def error(self, message): """Override superclass' error() method to ignore errors.""" # construct error message message += ', ' + self._location() # store error message - debugio.debug('parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message) + debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.error(): problem parsing html: ' + message) if self.errmsg is None: self.errmsg = message # increment error count @@ -105,7 +107,7 @@ class _MyHTMLParser(HTMLParser.HTMLParser): try: return HTMLParser.HTMLParser.check_for_whole_start_tag(self, i) except AssertionError: - debugio.debug('parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error') + debugio.debug('webcheck.parsers.html.htmlparser._MyHTMLParser.check_for_whole_start_tag(): caught assertion error') return None def handle_starttag(self, tag, attrs): @@ -210,8 +212,8 @@ class _MyHTMLParser(HTMLParser.HTMLParser): # pick up any tags with a style attribute if 'style' in attrs: # delegate handling of inline css to css module - import parsers.css - parsers.css.parse(attrs['style'], self.link, self.base) + import webcheck.parsers.css + webcheck.parsers.css.parse(attrs['style'], self.link, self.base) def handle_endtag(self, tag): """Handle end tags in html.""" @@ -220,8 +222,8 @@ class _MyHTMLParser(HTMLParser.HTMLParser): self.collect = None elif tag == 'style' and self.collect is not None: # delegate handling of inline css to css module - import parsers.css - parsers.css.parse(self.collect, self.link, self.base) + import webcheck.parsers.css + webcheck.parsers.css.parse(self.collect, self.link, self.base) def handle_data(self, data): """Collect data if we were collecting data.""" @@ -272,13 +274,13 @@ def parse(content, link): parser.close() except Exception, e: # ignore (but log) all errors - debugio.debug('parsers.html.htmlparser.parse(): caught exception: ' + str(e)) + debugio.debug('webcheck.parsers.html.htmlparser.parse(): caught exception: ' + str(e)) # check for parser errors if parser.errmsg is not None: - debugio.debug('parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg) + debugio.debug('webcheck.parsers.html.htmlparser.parse(): problem parsing html: ' + parser.errmsg) link.add_pageproblem('problem parsing html: %s' % parser.errmsg) # dump encoding - debugio.debug('parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding)) + debugio.debug('webcheck.parsers.html.htmlparser.parse(): html encoding: %s' % str(link.encoding)) 
# flag that the link contains a valid page link.is_page = True # save the title diff --git a/plugins/__init__.py b/webcheck/plugins/__init__.py index 2607101..e753ed0 100644 --- a/plugins/__init__.py +++ b/webcheck/plugins/__init__.py @@ -49,13 +49,10 @@ import time from sqlalchemy.orm import joinedload from sqlalchemy.orm.session import object_session -import config -import db -import debugio -import parsers.html - -# reference function from html module -htmlescape = parsers.html.htmlescape +from webcheck.db import Link +from webcheck.parsers.html import htmlescape +import webcheck.config +import webcheck.debugio def _floatformat(f): @@ -129,7 +126,7 @@ def make_link(link, title=None): is external, insert "class=external" in the <a> tag.""" return '<a href="%(url)s" %(target)sclass="%(cssclass)s" title="%(info)s">%(title)s</a>' % \ dict(url=htmlescape(link.url), - target='target="_blank" ' if config.REPORT_LINKS_IN_NEW_WINDOW else '', + target='target="_blank" ' if webcheck.config.REPORT_LINKS_IN_NEW_WINDOW else '', cssclass='internal' if link.is_internal else 'external', info=htmlescape(_get_info(link)).replace('\n', ' '), title=htmlescape(title or link.title or link.url)) @@ -142,7 +139,7 @@ def print_parents(fp, link, indent=' '): count = link.count_parents if not count: return - parents = link.parents.order_by(db.Link.title, db.Link.url).options(joinedload(db.Link.linkproblems))[:config.PARENT_LISTLEN] + parents = link.parents.order_by(Link.title, Link.url).options(joinedload(Link.linkproblems))[:webcheck.config.PARENT_LISTLEN] fp.write( indent + '<div class="parents">\n' + indent + ' referenced from:\n' + @@ -165,26 +162,26 @@ def print_parents(fp, link, indent=' '): def open_file(filename, istext=True, makebackup=False): """This returns an open file object which can be used for writing. This file is created in the output directory. The output directory (stored in - config.OUTPUT_DIR is created if it does not yet exist. If the second + webcheck.config.OUTPUT_DIR is created if it does not yet exist. If the second parameter is True (default) the file is opened as an UTF-8 text file.""" import os # check if output directory exists and create it if needed - if not os.path.isdir(config.OUTPUT_DIR): + if not os.path.isdir(webcheck.config.OUTPUT_DIR): try: - os.mkdir(config.OUTPUT_DIR) + os.mkdir(webcheck.config.OUTPUT_DIR) except OSError, (errno, strerror): debugio.error('error creating directory %(dir)s: %(strerror)s' % - {'dir': config.OUTPUT_DIR, + {'dir': webcheck.config.OUTPUT_DIR, 'strerror': strerror}) sys.exit(1) # build the output file name - fname = os.path.join(config.OUTPUT_DIR, filename) + fname = os.path.join(webcheck.config.OUTPUT_DIR, filename) # check if file exists if os.path.exists(fname): if makebackup: # create backup of original (overwriting previous backup) os.rename(fname, fname + '~') - elif not config.OVERWRITE_FILES: + elif not webcheck.config.OVERWRITE_FILES: # ask to overwrite try: res = raw_input('webcheck: overwrite %s? [y]es, [a]ll, [q]uit: ' % fname) @@ -194,7 +191,7 @@ def open_file(filename, istext=True, makebackup=False): res = 'q' res = res.lower() + ' ' if res[0] == 'a': - config.OVERWRITE_FILES = True + webcheck.config.OVERWRITE_FILES = True elif res[0] != 'y': print 'Aborted.' 
sys.exit(1) @@ -214,9 +211,9 @@ def open_file(filename, istext=True, makebackup=False): def _print_navbar(fp, plugin): """Return an html fragement representing the navigation bar for a page.""" fp.write(' <ul class="navbar">\n') - for p in config.PLUGINS: + for p in webcheck.config.PLUGINS: # import the plugin - report = __import__('plugins.' + p, globals(), locals(), [p]) + report = __import__('webcheck.plugins.' + p, globals(), locals(), [p]) # skip if no outputfile if not hasattr(report, '__outputfile__'): continue @@ -258,7 +255,7 @@ def open_html(plugin, site): % {'sitetitle': htmlescape(base.title or base.url), 'plugintitle': htmlescape(plugin.__title__), 'siteurl': base.url, - 'version': config.VERSION}) + 'version': webcheck.config.VERSION}) # write navigation bar _print_navbar(fp, plugin) # write plugin heading @@ -279,6 +276,6 @@ def close_html(fp): ' </body>\n' '</html>\n' % {'time': htmlescape(time.ctime(time.time())), - 'homepage': config.HOMEPAGE, - 'version': htmlescape(config.VERSION)}) + 'homepage': webcheck.config.HOMEPAGE, + 'version': htmlescape(webcheck.config.VERSION)}) fp.close() diff --git a/plugins/about.py b/webcheck/plugins/about.py index fdfb3a1..ada2685 100644 --- a/plugins/about.py +++ b/webcheck/plugins/about.py @@ -30,15 +30,15 @@ __outputfile__ = 'about.html' import time -import config -import db -import plugins +from webcheck.db import Session, Link +import webcheck.config +import webcheck.plugins def generate(site): """Output a list of modules, it's authors and the webcheck version.""" - fp = plugins.open_html(plugins.about, site) - session = db.Session() + fp = webcheck.plugins.open_html(webcheck.plugins.about, site) + session = Session() # TODO: xxx links were fetched, xxx pages were examined and a total of xxx notes and problems were found # TODO: include some runtime information (e.g. supported schemes, user configuration, etc) # output some general information about the report @@ -56,10 +56,10 @@ def generate(site): ' This report was generated on %(time)s, a total of %(numurls)d\n' ' links were found.\n' ' </p>\n\n' - % {'version': plugins.htmlescape(config.VERSION), - 'time': plugins.htmlescape(time.ctime(time.time())), - 'numurls': session.query(db.Link).count(), - 'homepage': config.HOMEPAGE}) + % {'version': webcheck.plugins.htmlescape(webcheck.config.VERSION), + 'time': webcheck.plugins.htmlescape(time.ctime(time.time())), + 'numurls': session.query(Link).count(), + 'homepage': webcheck.config.HOMEPAGE}) # output copyright information fp.write( ' <h3>Copyright</h3>\n' @@ -100,15 +100,15 @@ def generate(site): fp.write( ' <h3>Plugins</h3>\n' ' <ul>\n') - for plugin in config.PLUGINS: - report = __import__('plugins.' + plugin, globals(), locals(), [plugin]) + for plugin in webcheck.config.PLUGINS: + report = __import__('webcheck.plugins.' 
+ plugin, globals(), locals(), [plugin]) fp.write( ' <li>\n' ' <strong>%s</strong><br />\n' - % plugins.htmlescape(report.__title__)) + % webcheck.plugins.htmlescape(report.__title__)) if hasattr(report, '__doc__'): - fp.write(' %s<br />\n' % plugins.htmlescape(report.__doc__)) + fp.write(' %s<br />\n' % webcheck.plugins.htmlescape(report.__doc__)) fp.write(' </li>\n') fp.write( ' </ul>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/anchors.py b/webcheck/plugins/anchors.py index 420b0fc..3b1e88d 100644 --- a/plugins/anchors.py +++ b/webcheck/plugins/anchors.py @@ -27,22 +27,22 @@ This plugin does not output any files, it just finds problems.""" __title__ = 'missing anchors' __author__ = 'Arthur de Jong' -import db +from webcheck.db import Session, Link, Anchor def postprocess(site): """Add all missing anchors as page problems to the referring page.""" - session = db.Session() + session = Session() # find all fetched links with requested anchors - links = session.query(db.Link).filter(db.Link.reqanchors.any()) - links = links.filter(db.Link.fetched != None) + links = session.query(Link).filter(Link.reqanchors.any()) + links = links.filter(Link.fetched != None) # go over list and find missing anchors # TODO: we can probably make a nicer query for this for link in links: # check that all requested anchors exist for anchor in link.reqanchors: # if the anchor is not there there, report problem - if not link.anchors.filter(db.Anchor.anchor == anchor.anchor).first(): + if not link.anchors.filter(Anchor.anchor == anchor.anchor).first(): anchor.parent.add_pageproblem( u'bad link: %(url)s#%(anchor)s: unknown anchor' % {'url': link.url, diff --git a/plugins/badlinks.py b/webcheck/plugins/badlinks.py index 1816adf..080d2da 100644 --- a/plugins/badlinks.py +++ b/webcheck/plugins/badlinks.py @@ -30,15 +30,15 @@ __outputfile__ = 'badlinks.html' from sqlalchemy.orm import joinedload -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def postporcess(site): """Add all bad links as pageproblems on pages where they are linked.""" - session = db.Session() + session = Session() # find all links with link problems - links = session.query(db.Link).filter(db.Link.linkproblems.any()).options(joinedload(db.Link.linkproblems)) + links = session.query(Link).filter(Link.linkproblems.any()).options(joinedload(Link.linkproblems)) # TODO: probably make it a nicer query over all linkproblems for link in links: # add a reference to the problem map @@ -50,17 +50,17 @@ def postporcess(site): def generate(site): """Present the list of bad links.""" - session = db.Session() + session = Session() # find all links with link problems - links = session.query(db.Link).filter(db.Link.linkproblems.any()).order_by(db.Link.url).options(joinedload(db.Link.linkproblems)) + links = session.query(Link).filter(Link.linkproblems.any()).order_by(Link.url).options(joinedload(Link.linkproblems)) # present results - fp = plugins.open_html(plugins.badlinks, site) + fp = webcheck.plugins.open_html(webcheck.plugins.badlinks, site) if not links: fp.write( ' <p class="description">\n' ' There were no problems retrieving links from the website.\n' ' </p>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' @@ -73,18 +73,18 @@ def generate(site): ' <li>\n' ' %(badurl)s\n' ' <ul class="problems">\n' - % {'badurl': plugins.make_link(link, link.url)}) + % {'badurl': webcheck.plugins.make_link(link, link.url)}) # list the 
problems for problem in link.linkproblems: fp.write( ' <li>%(problem)s</li>\n' - % {'problem': plugins.htmlescape(problem)}) + % {'problem': webcheck.plugins.htmlescape(problem)}) fp.write( ' </ul>\n') # present a list of parents - plugins.print_parents(fp, link, ' ') + webcheck.plugins.print_parents(fp, link, ' ') fp.write( ' </li>\n') fp.write( ' </ol>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/external.py b/webcheck/plugins/external.py index 056b766..17f292e 100644 --- a/plugins/external.py +++ b/webcheck/plugins/external.py @@ -30,23 +30,23 @@ __outputfile__ = 'external.html' from sqlalchemy.orm import joinedload -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def generate(site): """Generate the list of external links.""" - session = db.Session() + session = Session() # get all external links - links = session.query(db.Link).filter(db.Link.is_internal != True).order_by(db.Link.url) + links = session.query(Link).filter(Link.is_internal != True).order_by(Link.url) # present results - fp = plugins.open_html(plugins.external, site) + fp = webcheck.plugins.open_html(webcheck.plugins.external, site) if not links: fp.write( ' <p class="description">' ' No external links were found on the website.' ' </p>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">' @@ -54,15 +54,15 @@ def generate(site): ' examination of the website.' ' </p>\n' ' <ol>\n') - for link in links.options(joinedload(db.Link.linkproblems)): + for link in links.options(joinedload(Link.linkproblems)): fp.write( ' <li>\n' ' %(link)s\n' - % {'link': plugins.make_link(link)}) + % {'link': webcheck.plugins.make_link(link)}) # present a list of parents - plugins.print_parents(fp, link, ' ') + webcheck.plugins.print_parents(fp, link, ' ') fp.write( ' </li>\n') fp.write( ' </ol>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/images.py b/webcheck/plugins/images.py index e3575db..ddeb9af 100644 --- a/plugins/images.py +++ b/webcheck/plugins/images.py @@ -30,27 +30,27 @@ __outputfile__ = 'images.html' import re -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def generate(site): """Generate a list of image URLs that were found.""" - session = db.Session() + session = Session() # get non-page links that have an image/* mimetype - links = session.query(db.Link) - links = links.filter((db.Link.is_page != True) | (db.Link.is_page == None)) - links = links.filter(db.Link.mimetype.startswith('image/')) - links = links.order_by(db.Link.url) + links = session.query(Link) + links = links.filter((Link.is_page != True) | (Link.is_page == None)) + links = links.filter(Link.mimetype.startswith('image/')) + links = links.order_by(Link.url) # present results - fp = plugins.open_html(plugins.images, site) + fp = webcheck.plugins.open_html(webcheck.plugins.images, site) if not links: fp.write( ' <p class="description">\n' ' No images were linked on the website.\n' ' </p>\n' ' <ol>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' @@ -58,7 +58,7 @@ def generate(site): ' </p>\n' ' <ol>\n') for link in links: - fp.write(' <li>%s</li>\n' % plugins.make_link(link, link.url)) + fp.write(' <li>%s</li>\n' % webcheck.plugins.make_link(link, link.url)) fp.write( ' </ol>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/new.py b/webcheck/plugins/new.py index 
aa69315..94d607d 100644 --- a/plugins/new.py +++ b/webcheck/plugins/new.py @@ -30,9 +30,9 @@ __outputfile__ = 'new.html' import time -import config -import db -import plugins +from webcheck.db import Session, Link +import webcheck.config +import webcheck.plugins SECS_PER_DAY = 60 * 60 * 24 @@ -40,28 +40,28 @@ SECS_PER_DAY = 60 * 60 * 24 def generate(site): """Output the list of recently modified pages.""" - session = db.Session() + session = Session() # the time for which links are considered new - newtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSNEW_URL_AGE + newtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSNEW_URL_AGE # get all internal pages that are new - links = session.query(db.Link).filter_by(is_page=True, is_internal=True) - links = links.filter(db.Link.mtime > newtime).order_by(db.Link.mtime.desc()) + links = session.query(Link).filter_by(is_page=True, is_internal=True) + links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc()) # present results - fp = plugins.open_html(plugins.new, site) + fp = webcheck.plugins.open_html(webcheck.plugins.new, site) if not links.count(): fp.write( ' <p class="description">\n' ' No pages were found that were modified within the last %(new)d days.\n' ' </p>\n' - % {'new': config.REPORT_WHATSNEW_URL_AGE}) - plugins.close_html(fp) + % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE}) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' ' These pages have been recently modified (within %(new)d days).\n' ' </p>\n' ' <ul>\n' - % {'new': config.REPORT_WHATSNEW_URL_AGE}) + % {'new': webcheck.config.REPORT_WHATSNEW_URL_AGE}) for link in links: age = (time.time() - link.mtime) / SECS_PER_DAY fp.write( @@ -71,7 +71,7 @@ def generate(site): ' <li>age: %(age)d days</li>\n' ' </ul>\n' ' </li>\n' - % {'link': plugins.make_link(link), + % {'link': webcheck.plugins.make_link(link), 'age': age}) fp.write(' </ul>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/notchkd.py b/webcheck/plugins/notchkd.py index 923813e..737afaa 100644 --- a/plugins/notchkd.py +++ b/webcheck/plugins/notchkd.py @@ -30,23 +30,23 @@ __outputfile__ = 'notchkd.html' from sqlalchemy.orm import joinedload -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def generate(site): """Output the list of not checked pages.""" - session = db.Session() + session = Session() # get all yanked urls - links = session.query(db.Link).filter(db.Link.yanked != None).order_by(db.Link.url) + links = session.query(Link).filter(Link.yanked != None).order_by(Link.url) # present results - fp = plugins.open_html(plugins.notchkd, site) + fp = webcheck.plugins.open_html(webcheck.plugins.notchkd, site) if not links.count(): fp.write( ' <p class="description">\n' ' All links have been checked.\n' ' </p>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' @@ -54,15 +54,15 @@ def generate(site): ' at all during the examination of the website.\n' ' </p>\n' ' <ol>\n') - for link in links.options(joinedload(db.Link.linkproblems)): + for link in links.options(joinedload(Link.linkproblems)): fp.write( ' <li>\n' ' %(link)s\n' - % {'link': plugins.make_link(link, link.url)}) + % {'link': webcheck.plugins.make_link(link, link.url)}) # present a list of parents - plugins.print_parents(fp, link, ' ') + webcheck.plugins.print_parents(fp, link, ' ') fp.write( ' </li>\n') fp.write( ' </ol>\n') - plugins.close_html(fp) + 
webcheck.plugins.close_html(fp) diff --git a/plugins/notitles.py b/webcheck/plugins/notitles.py index c378fbf..531acf7 100644 --- a/plugins/notitles.py +++ b/webcheck/plugins/notitles.py @@ -30,17 +30,17 @@ __outputfile__ = 'notitles.html' from sqlalchemy.sql.functions import char_length -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def postprocess(site): """Add page problems for all pages without a title.""" - session = db.Session() + session = Session() # get all internal pages without a title - links = session.query(db.Link).filter_by(is_page=True, is_internal=True) - links = links.filter((char_length(db.Link.title) == 0) | - (db.Link.title == None)) + links = session.query(Link).filter_by(is_page=True, is_internal=True) + links = links.filter((char_length(Link.title) == 0) | + (Link.title == None)) for link in links: link.add_pageproblem('missing title') session.commit() @@ -48,19 +48,19 @@ def postprocess(site): def generate(site): """Output the list of pages without a title.""" - session = db.Session() + session = Session() # get all internal pages without a title - links = session.query(db.Link).filter_by(is_page=True, is_internal=True) - links = links.filter((char_length(db.Link.title) == 0) | - (db.Link.title == None)).order_by(db.Link.url) + links = session.query(Link).filter_by(is_page=True, is_internal=True) + links = links.filter((char_length(Link.title) == 0) | + (Link.title == None)).order_by(Link.url) # present results - fp = plugins.open_html(plugins.notitles, site) + fp = webcheck.plugins.open_html(webcheck.plugins.notitles, site) if not links.count(): fp.write( ' <p class="description">\n' ' All pages had a title specified.\n' ' </p>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' @@ -71,7 +71,7 @@ def generate(site): for link in links: fp.write( ' <li>%(link)s</li>\n' - % {'link': plugins.make_link(link, link.url)}) + % {'link': webcheck.plugins.make_link(link, link.url)}) fp.write( ' </ol>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/old.py b/webcheck/plugins/old.py index 5596f4e..859f54d 100644 --- a/plugins/old.py +++ b/webcheck/plugins/old.py @@ -30,9 +30,9 @@ __outputfile__ = 'old.html' import time -import config -import db -import plugins +from webcheck.db import Session, Link +import webcheck.config +import webcheck.plugins SECS_PER_DAY = 60 * 60 * 24 @@ -40,21 +40,21 @@ SECS_PER_DAY = 60 * 60 * 24 def generate(site): """Output the list of outdated pages to the specified file descriptor.""" - session = db.Session() + session = Session() # the time for which links are considered old - oldtime = time.time() - SECS_PER_DAY * config.REPORT_WHATSOLD_URL_AGE + oldtime = time.time() - SECS_PER_DAY * webcheck.config.REPORT_WHATSOLD_URL_AGE # get all internal pages that are old - links = session.query(db.Link).filter_by(is_page=True, is_internal=True) - links = links.filter(db.Link.mtime < oldtime).order_by(db.Link.mtime) + links = session.query(Link).filter_by(is_page=True, is_internal=True) + links = links.filter(Link.mtime < oldtime).order_by(Link.mtime) # present results - fp = plugins.open_html(plugins.old, site) + fp = webcheck.plugins.open_html(webcheck.plugins.old, site) if not links.count(): fp.write( ' <p class="description">\n' ' No pages were found that were older than %(old)d days old.\n' ' </p>\n' - % {'old': config.REPORT_WHATSOLD_URL_AGE}) - plugins.close_html(fp) + % {'old': 
webcheck.config.REPORT_WHATSOLD_URL_AGE}) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' @@ -62,7 +62,7 @@ def generate(site): ' days) and may be outdated.\n' ' </p>\n' ' <ul>\n' - % {'old': config.REPORT_WHATSOLD_URL_AGE}) + % {'old': webcheck.config.REPORT_WHATSOLD_URL_AGE}) for link in links: age = (time.time() - link.mtime) / SECS_PER_DAY fp.write( @@ -72,8 +72,8 @@ def generate(site): ' <li>age: %(age)d days</li>\n' ' </ul>\n' ' </li>\n' - % {'link': plugins.make_link(link), + % {'link': webcheck.plugins.make_link(link), 'age': age}) fp.write( ' </ul>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/problems.py b/webcheck/plugins/problems.py index 4a03f7b..4f9403c 100644 --- a/plugins/problems.py +++ b/webcheck/plugins/problems.py @@ -30,8 +30,8 @@ __outputfile__ = 'problems.html' import urllib -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def _mk_id(name): @@ -50,12 +50,12 @@ def _mk_id(name): def generate(site): """Output the overview of problems per author.""" - session = db.Session() + session = Session() # make a list of problems per author problem_db = {} # get internal links with page problems - links = session.query(db.Link).filter_by(is_internal=True) - links = links.filter(db.Link.pageproblems.any()).order_by(db.Link.url) + links = session.query(Link).filter_by(is_internal=True) + links = links.filter(Link.pageproblems.any()).order_by(Link.url) for link in links: # make a normal name for the author if link.author: @@ -67,13 +67,13 @@ def generate(site): problem_db[author].append(link) else: problem_db[author] = [link] - fp = plugins.open_html(plugins.problems, site) + fp = webcheck.plugins.open_html(webcheck.plugins.problems, site) if not problem_db: fp.write( ' <p class="description">\n' ' No problems were found on this site, hurray.\n' ' </p>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) return # print description fp.write( @@ -90,8 +90,8 @@ def generate(site): for author in authors: fp.write( ' <li><a href="#author_%(authorref)s">Author: %(author)s</a></li>\n' - % {'authorref': plugins.htmlescape(_mk_id(author)), - 'author': plugins.htmlescape(author)}) + % {'authorref': webcheck.plugins.htmlescape(_mk_id(author)), + 'author': webcheck.plugins.htmlescape(author)}) fp.write(' </ul>\n') # generate problem report fp.write(' <ul>\n') @@ -100,8 +100,8 @@ def generate(site): ' <li id="author_%(authorref)s">\n' ' Author: %(author)s\n' ' <ul>\n' - % {'authorref': plugins.htmlescape(_mk_id(author)), - 'author': plugins.htmlescape(author)}) + % {'authorref': webcheck.plugins.htmlescape(_mk_id(author)), + 'author': webcheck.plugins.htmlescape(author)}) # sort pages by url problem_db[author].sort(lambda a, b: cmp(a.url, b.url)) # list problems for this author @@ -111,12 +111,12 @@ def generate(site): ' <li>\n' ' %(link)s\n' ' <ul class="problems">\n' - % {'link': plugins.make_link(link)}) + % {'link': webcheck.plugins.make_link(link)}) # list the problems for problem in link.pageproblems: fp.write( ' <li>%(problem)s</li>\n' - % {'problem': plugins.htmlescape(problem)}) + % {'problem': webcheck.plugins.htmlescape(problem)}) # end the list item fp.write( ' </ul>\n' @@ -126,4 +126,4 @@ def generate(site): ' </li>\n') fp.write( ' </ul>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/sitemap.py b/webcheck/plugins/sitemap.py index f6b8963..93c975a 100644 --- a/plugins/sitemap.py +++ b/webcheck/plugins/sitemap.py @@ -28,25 
+28,25 @@ __title__ = 'site map' __author__ = 'Arthur de Jong' __outputfile__ = 'index.html' -import config -import db -import plugins +from webcheck.db import Session, Link +import webcheck.config +import webcheck.plugins def add_pagechildren(link, children, explored): """Determine the page children of this link, combining the children of embedded items and following redirects.""" # get all internal children - qry = link.children.filter(db.Link.is_internal == True) + qry = link.children.filter(Link.is_internal == True) if link.depth: - qry = qry.filter((db.Link.depth > link.depth) | (db.Link.depth == None)) + qry = qry.filter((Link.depth > link.depth) | (Link.depth == None)) # follow redirects children.update(y for y in (x.follow_link() for x in qry) if y and y.is_page and y.is_internal and y.id not in explored) explored.update(x.id for x in children) # add embedded element's pagechildren (think frames) - for embed in link.embedded.filter(db.Link.is_internal == True).filter(db.Link.is_page == True): + for embed in link.embedded.filter(Link.is_internal == True).filter(Link.is_page == True): # TODO: put this in a query if embed.id not in explored and \ (embed.depth == None or embed.depth > link.depth): @@ -58,9 +58,9 @@ def _explore(fp, link, explored, depth=0, indent=' '): site. Prints the html results to the file descriptor.""" # output this link fp.write(indent + '<li>\n') - fp.write(indent + ' ' + plugins.make_link(link) + '\n') + fp.write(indent + ' ' + webcheck.plugins.make_link(link) + '\n') # only check children if we are not too deep yet - if depth <= config.REPORT_SITEMAP_LEVEL: + if depth <= webcheck.config.REPORT_SITEMAP_LEVEL: # figure out the links to follow and ensure that they are only # explored from here children = set() @@ -80,8 +80,8 @@ def _explore(fp, link, explored, depth=0, indent=' '): def generate(site): """Output the sitemap.""" - session = db.Session() - fp = plugins.open_html(plugins.sitemap, site) + session = Session() + fp = webcheck.plugins.open_html(webcheck.plugins.sitemap, site) # output the site structure using breadth first traversal fp.write( ' <p class="description">\n' @@ -93,4 +93,4 @@ def generate(site): _explore(fp, l, explored) fp.write( ' </ul>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/size.py b/webcheck/plugins/size.py index c4d33f8..641936f 100644 --- a/plugins/size.py +++ b/webcheck/plugins/size.py @@ -28,9 +28,9 @@ __title__ = "what's big" __author__ = 'Arthur de Jong' __outputfile__ = 'size.html' -import config -import db -import plugins +from webcheck.db import Session, Link +import webcheck.config +import webcheck.plugins def _getsize(link, done=None): @@ -57,22 +57,22 @@ def _getsize(link, done=None): def generate(site): """Output the list of large pages.""" - session = db.Session() + session = Session() # get all internal pages and get big links - links = session.query(db.Link).filter_by(is_page=True, is_internal=True) + links = session.query(Link).filter_by(is_page=True, is_internal=True) links = [x for x in links - if _getsize(x) >= config.REPORT_SLOW_URL_SIZE * 1024] + if _getsize(x) >= webcheck.config.REPORT_SLOW_URL_SIZE * 1024] # sort links by size (biggest first) links.sort(lambda a, b: cmp(b.total_size, a.total_size)) # present results - fp = plugins.open_html(plugins.size, site) + fp = webcheck.plugins.open_html(webcheck.plugins.size, site) if not links: fp.write( ' <p class="description">\n' ' No pages over %(size)dK were found.\n' ' </p>\n' - % {'size': config.REPORT_SLOW_URL_SIZE}) - 
plugins.close_html(fp) + % {'size': webcheck.config.REPORT_SLOW_URL_SIZE}) + webcheck.plugins.close_html(fp) return fp.write( ' <p class="description">\n' @@ -80,9 +80,9 @@ def generate(site): ' slow to download.\n' ' </p>\n' ' <ul>\n' - % {'size': config.REPORT_SLOW_URL_SIZE}) + % {'size': webcheck.config.REPORT_SLOW_URL_SIZE}) for link in links: - size = plugins.get_size(link.total_size) + size = webcheck.plugins.get_size(link.total_size) fp.write( ' <li>\n' ' %(link)s\n' @@ -90,8 +90,8 @@ def generate(site): ' <li>size: %(size)s</li>\n' ' </ul>\n' ' </li>\n' - % {'link': plugins.make_link(link), + % {'link': webcheck.plugins.make_link(link), 'size': size}) fp.write( ' </ul>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) diff --git a/plugins/urllist.py b/webcheck/plugins/urllist.py index 1160b2e..f630c63 100644 --- a/plugins/urllist.py +++ b/webcheck/plugins/urllist.py @@ -26,14 +26,14 @@ __title__ = 'url list' __author__ = 'Arthur de Jong' __outputfile__ = 'urllist.html' -import db -import plugins +from webcheck.db import Session, Link +import webcheck.plugins def generate(site): """Output a sorted list of URLs.""" - session = db.Session() - fp = plugins.open_html(plugins.urllist, site) + session = Session() + fp = webcheck.plugins.open_html(webcheck.plugins.urllist, site) fp.write( ' <p class="description">\n' ' This is the list of all urls encountered during the examination of\n' @@ -41,9 +41,9 @@ def generate(site): ' non-examined urls.\n' ' </p>\n' ' <ol>\n') - links = session.query(db.Link).order_by(db.Link.url) + links = session.query(Link).order_by(Link.url) for link in links: - fp.write(' <li>' + plugins.make_link(link, link.url) + '</li>\n') + fp.write(' <li>' + webcheck.plugins.make_link(link, link.url) + '</li>\n') fp.write( ' </ol>\n') - plugins.close_html(fp) + webcheck.plugins.close_html(fp) |