From 7d7b8cb696c023e3917c9f15485c0d544de7bbe7 Mon Sep 17 00:00:00 2001
From: Arthur de Jong
Date: Fri, 16 Sep 2011 13:36:38 +0000
Subject: move all the code except the command-line handling to the webcheck package and reorganise imports accordingly

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@435 86f53f14-5ff3-0310-afe5-9b438ce3f40c
---
 cmd.py | 295 +++++++++++++++++++++++
 config.py | 131 ----------
 crawler.py | 421 --------------------------------
 db.py | 300 -----------------------
 debugio.py | 65 -----
 monkeypatch.py | 81 -------
 myurllib.py | 120 ----------
 parsers/__init__.py | 63 -----
 parsers/css.py | 55 -----
 parsers/html/__init__.py | 123 ----------
 parsers/html/beautifulsoup.py | 191 ---------------
 parsers/html/calltidy.py | 36 ---
 parsers/html/htmlparser.py | 304 ------------------------
 plugins/__init__.py | 284 ----------------------
 plugins/about.py | 114 ---------
 plugins/anchors.py | 51 ----
 plugins/badlinks.py | 90 -------
 plugins/external.py | 68 ------
 plugins/images.py | 64 -----
 plugins/new.py | 77 ------
 plugins/notchkd.py | 68 ------
 plugins/notitles.py | 77 ------
 plugins/old.py | 79 ------
 plugins/problems.py | 129 ----------
 plugins/sitemap.py | 96 --------
 plugins/size.py | 97 --------
 plugins/urllist.py | 49 ----
 webcheck.py | 295 -----------------------
 webcheck/__init__.py | 0
 webcheck/config.py | 132 +++++++++++
 webcheck/crawler.py | 422 +++++++++++++++++++++++++++++++
 webcheck/db.py | 299 +++++++++++++++++++++++
 webcheck/debugio.py | 66 ++++++
 webcheck/monkeypatch.py | 81 +++++++
 webcheck/myurllib.py | 120 ++++++++++
 webcheck/parsers/__init__.py | 63 +++++
 webcheck/parsers/css.py | 56 +++++
 webcheck/parsers/html/__init__.py | 125 ++++++++++
 webcheck/parsers/html/beautifulsoup.py | 194 +++++++++++++++
 webcheck/parsers/html/calltidy.py | 37 +++
 webcheck/parsers/html/htmlparser.py | 306 ++++++++++++++++++++++++
 webcheck/plugins/__init__.py | 281 ++++++++++++++++++++++
 webcheck/plugins/about.py | 114 +++++++++
 webcheck/plugins/anchors.py | 51 ++++
 webcheck/plugins/badlinks.py | 90 +++++++
 webcheck/plugins/external.py | 68 ++++++
 webcheck/plugins/images.py | 64 +++++
 webcheck/plugins/new.py | 77 ++++++
 webcheck/plugins/notchkd.py | 68 ++++++
 webcheck/plugins/notitles.py | 77 ++++++
 webcheck/plugins/old.py | 79 ++++++
 webcheck/plugins/problems.py | 129 ++++++++++
 webcheck/plugins/sitemap.py | 96 ++++++++
 webcheck/plugins/size.py | 97 ++++++++
 webcheck/plugins/urllist.py | 49 ++++
 55 files changed, 3536 insertions(+), 3528 deletions(-)
 create mode 100755 cmd.py
 delete mode 100644 config.py
 delete mode 100644 crawler.py
 delete mode 100644 db.py
 delete mode 100644 debugio.py
 delete mode 100644 monkeypatch.py
 delete mode 100644 myurllib.py
 delete mode 100644 parsers/__init__.py
 delete mode 100644 parsers/css.py
 delete mode 100644 parsers/html/__init__.py
 delete mode 100644 parsers/html/beautifulsoup.py
 delete mode 100644 parsers/html/calltidy.py
 delete mode 100644 parsers/html/htmlparser.py
 delete mode 100644 plugins/__init__.py
 delete mode 100644 plugins/about.py
 delete mode 100644 plugins/anchors.py
 delete mode 100644 plugins/badlinks.py
 delete mode 100644 plugins/external.py
 delete mode 100644 plugins/images.py
 delete mode 100644 plugins/new.py
 delete mode 100644 plugins/notchkd.py
 delete mode 100644 plugins/notitles.py
 delete mode 100644 plugins/old.py
 delete mode 100644 plugins/problems.py
 delete mode 100644 plugins/sitemap.py
 delete mode 100644 plugins/size.py
 delete mode 100644 plugins/urllist.py
 delete mode 100755 webcheck.py
 create mode 100644
webcheck/__init__.py create mode 100644 webcheck/config.py create mode 100644 webcheck/crawler.py create mode 100644 webcheck/db.py create mode 100644 webcheck/debugio.py create mode 100644 webcheck/monkeypatch.py create mode 100644 webcheck/myurllib.py create mode 100644 webcheck/parsers/__init__.py create mode 100644 webcheck/parsers/css.py create mode 100644 webcheck/parsers/html/__init__.py create mode 100644 webcheck/parsers/html/beautifulsoup.py create mode 100644 webcheck/parsers/html/calltidy.py create mode 100644 webcheck/parsers/html/htmlparser.py create mode 100644 webcheck/plugins/__init__.py create mode 100644 webcheck/plugins/about.py create mode 100644 webcheck/plugins/anchors.py create mode 100644 webcheck/plugins/badlinks.py create mode 100644 webcheck/plugins/external.py create mode 100644 webcheck/plugins/images.py create mode 100644 webcheck/plugins/new.py create mode 100644 webcheck/plugins/notchkd.py create mode 100644 webcheck/plugins/notitles.py create mode 100644 webcheck/plugins/old.py create mode 100644 webcheck/plugins/problems.py create mode 100644 webcheck/plugins/sitemap.py create mode 100644 webcheck/plugins/size.py create mode 100644 webcheck/plugins/urllist.py diff --git a/cmd.py b/cmd.py new file mode 100755 index 0000000..dbbe9d6 --- /dev/null +++ b/cmd.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python + +# cmd.py - command-line front-end for webcheck +# +# Copyright (C) 1998, 1999 Albert Hopkins (marduk) +# Copyright (C) 2002 Mike W. Meyer +# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The files produced as output from the software do not automatically fall +# under the copyright of the software, unless explicitly stated otherwise. + +"""This is the main webcheck module.""" + +__version__ = '1.10.4' +__homepage__ = 'http://arthurdejong.org/webcheck/' + +import os +import re +import sys +import urllib +import urlparse + +from webcheck import config +# update some fields that currently are stored in config +config.VERSION = __version__ +config.HOMEPAGE = __homepage__ + +from webcheck import debugio +import webcheck.crawler +import webcheck.db +import webcheck.monkeypatch +import webcheck.plugins + +debugio.loglevel = debugio.INFO + + +def print_version(): + """Print version information.""" + sys.stdout.write( + 'webcheck %(version)s\n' + 'Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n' + '\n' + 'Copyright (C) 1998-2011\n' + 'Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n' + 'This is free software; see the source for copying conditions. There is NO\n' + 'warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n' + % {'version': __version__}) + + +def print_usage(): + """Print short usage information.""" + sys.stderr.write( + 'Usage: webcheck [OPTION]... 
URL...\n') + + +def print_tryhelp(): + """Print friendly pointer to more information.""" + sys.stderr.write( + 'Try \'webcheck --help\' for more information.\n') + + +def print_help(): + """Print the option list.""" + sys.stdout.write( + 'Usage: webcheck [OPTION]... URL...\n' + 'Generate a report for the given URLs\n' + '\n' + ' -i, --internal=PATTERN mark URLs matching PATTERN as internal\n' + ' -x, --external=PATTERN mark URLs matching PATTERN as external\n' + ' -y, --yank=PATTERN do not check URLs matching PATTERN\n' + ' -b, --base-only base URLs only: consider any URL not starting\n' + ' with any of the base URLs to be external\n' + ' -a, --avoid-external do not check external URLs\n' + ' --ignore-robots do not retrieve and parse robots.txt files\n' + ' -q, --quiet, --silent suppress progress messages\n' + ' -d, --debug do programmer-level debugging\n' + ' -o, --output=DIRECTORY store the generated reports in the specified\n' + ' directory\n' + ' -c, --continue try to continue from a previous run\n' + ' -f, --force overwrite files without asking\n' + ' -r, --redirects=N the number of redirects webcheck should follow,\n' + ' 0 implies to follow all redirects (default=%(redirects)d)\n' + ' -u, --userpass=URL specify a URL with user:pass so username and password are given\n' + ' to matching network locations, -u http://user:pass@example.com\n' + ' -w, --wait=SECONDS wait SECONDS between retrievals\n' + ' -V, --version output version information and exit\n' + ' -h, --help display this help and exit\n' + % {'redirects': config.REDIRECT_DEPTH}) + + +def parse_args(site): + """Parse command-line arguments.""" + import getopt + try: + optlist, args = getopt.gnu_getopt(sys.argv[1:], + 'i:x:y:l:baqdo:cfr:u:w:Vh', + ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external', + 'ignore-robots', + 'quiet', 'silent', 'debug', 'profile', 'output=', 'continue', + 'force', 'redirects=', 'userpass=', 'wait=', 'version', 'help')) + internal_urls = [] + external_urls = [] + yank_urls = [] + for flag, arg in optlist: + if flag in ('-i', '--internal'): + internal_urls.append(arg) + elif flag in ('-x', '--external'): + external_urls.append(arg) + elif flag in ('-y', '--yank'): + yank_urls.append(arg) + elif flag in ('-b', '--base-only'): + config.BASE_URLS_ONLY = True + elif flag in ('-a', '--avoid-external'): + config.AVOID_EXTERNAL_LINKS = True + elif flag in ('--ignore-robots',): + config.USE_ROBOTS = False + elif flag in ('-q', '--quiet', '--silent'): + debugio.loglevel = debugio.ERROR + elif flag in ('-d', '--debug'): + debugio.loglevel = debugio.DEBUG + elif flag in ('--profile',): + # undocumented on purpose + config.PROFILE = True + elif flag in ('-o', '--output'): + config.OUTPUT_DIR = arg + elif flag in ('-c', '--continue'): + config.CONTINUE = True + elif flag in ('-f', '--force'): + config.OVERWRITE_FILES = True + elif flag in ('-r', '--redirects'): + config.REDIRECT_DEPTH = int(arg) + elif flag in ('-u', '--userpass'): + (_scheme, _netloc, _path, _params, _query, _frag) = urlparse.urlparse(arg) + (_userpass, _netloc) = urllib.splituser(_netloc) + config.USERPASS[_netloc] = _userpass + elif flag in ('-w', '--wait'): + config.WAIT_BETWEEN_REQUESTS = float(arg) + elif flag in ('-V', '--version'): + print_version() + sys.exit(0) + elif flag in ('-h', '--help'): + print_help() + sys.exit(0) + if len(args) == 0 and not config.CONTINUE: + print_usage() + print_tryhelp() + sys.exit(1) + # ensure output directory exists + if not os.path.isdir(config.OUTPUT_DIR): + os.mkdir(config.OUTPUT_DIR) + # 
set up database connection + filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite') + from sqlalchemy import create_engine + engine = create_engine('sqlite:///' + filename) + webcheck.db.Session.configure(bind=engine) + # ensure that all tables are created + webcheck.db.Base.metadata.create_all(engine) + # TODO: schema migraton goes here + # add configuration to site + for pattern in internal_urls: + site.add_internal_re(pattern) + for pattern in external_urls: + site.add_external_re(pattern) + for pattern in yank_urls: + site.add_yanked_re(pattern) + for arg in args: + # if it does not look like a url it is probably a local file + if urlparse.urlsplit(arg)[0] == '': + arg = 'file://' + urllib.pathname2url(os.path.abspath(arg)) + site.add_internal(arg) + except getopt.error, reason: + sys.stderr.write('webcheck: %s\n' % reason) + print_tryhelp() + sys.exit(1) + except re.error, e: + sys.stderr.write('webcheck: %s\n' % str(e)) + sys.exit(1) + + +def install_file(source, text=False): + """Install the given file in the output directory. + If the text flag is set to true it is assumed the file is text, + translating line endings.""" + import shutil + import urlparse + # figure out mode to open the file with + mode = 'r' + if text: + mode += 'U' + # check with what kind of argument we are called + scheme = urlparse.urlsplit(source)[0] + if scheme == 'file': + # this is a file:/// url, translate to normal path and open + import urllib + source = urllib.url2pathname(urlparse.urlsplit(source)[2]) + elif scheme == '' and os.path.isabs(source): + # this is an absolute path, just open it as is + pass + elif scheme == '': + # this is a relavite path, try to fetch it from the python path + for directory in sys.path: + tst = os.path.join(directory, source) + if os.path.isfile(tst): + source = tst + break + # TODO: support more schemes here + # figure out the destination name + target = os.path.join(config.OUTPUT_DIR, os.path.basename(source)) + # test if source and target are the same + source = os.path.realpath(source) + if source == os.path.realpath(target): + debugio.warn('attempt to overwrite %(fname)s with itself' % {'fname': source}) + return + # open the input file + sfp = None + try: + sfp = open(source, mode) + except IOError, (errno, strerror): + debugio.error('%(fname)s: %(strerror)s' % + {'fname': source, + 'strerror': strerror}) + sys.exit(1) + # create file in output directory (with overwrite question) + tfp = webcheck.plugins.open_file(os.path.basename(source)) + # copy contents + shutil.copyfileobj(sfp, tfp) + # close files + tfp.close() + sfp.close() + + +def main(site): + """Main program.""" + # crawl through the website + debugio.info('checking site....') + webcheck.crawler.setup_urllib2() + site.crawl() # this will take a while + debugio.info('done.') + # do postprocessing (building site structure, etc) + debugio.info('postprocessing....') + site.postprocess() + debugio.info('done.') + # now we can write out the files + # start with the frame-description page + debugio.info('generating reports...') + # for every plugin, generate a page + site.generate() + # put extra files in the output directory + install_file('webcheck.css', True) + install_file('fancytooltips/fancytooltips.js', True) + install_file('favicon.ico', False) + debugio.info('done.') + + +if __name__ == '__main__': + try: + # initialize site object + site = webcheck.crawler.Site() + # parse command-line arguments + parse_args(site) + # run the main program + if config.PROFILE: + fname = 
os.path.join(config.OUTPUT_DIR, 'webcheck.prof') + try: + import cProfile + except ImportError: + import profile as cProfile + try: + import sqltap + sqltap.start() + except ImportError: + pass + cProfile.run('main(site)', fname) + if 'sqltap' in locals(): + statistics = sqltap.collect() + sqltap.report(statistics, os.path.join(config.OUTPUT_DIR, 'sqltap.html')) + else: + main(site) + except KeyboardInterrupt: + sys.stderr.write('Interrupted\n') + sys.exit(1) diff --git a/config.py b/config.py deleted file mode 100644 index e106b8e..0000000 --- a/config.py +++ /dev/null @@ -1,131 +0,0 @@ - -# config.py - configuration state for webcheck -# -# Copyright (C) 1998, 1999 Albert Hopkins (marduk) -# Copyright (C) 2002 Mike Meyer -# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""Configuration state for webcheck. - -This file contains the default configuration for webcheck. All configurable -items should be changeble from the command line.""" - -import urllib - -# Whether to consider any URL not starting with the base URL to be external. -# This is the state of the -b command line option. -BASE_URLS_ONLY = False - -# Avoid checking external links at all. This is the state of the -a command -# line option. -AVOID_EXTERNAL_LINKS = False - -# The proxy configuration. -PROXIES = urllib.getproxies_environment() - -# IO timeout as passed to socket.setdefaulttimeout() -# value is a float in seconds None disables the timeout -IOTIMEOUT = 10.0 - -# Output directory. This is the state of the -o command line option. -OUTPUT_DIR = '.' - -# Whether to try to read a state file to continue from. -CONTINUE = False - -# Whether to produce profiling information. This is for development -# purposes and as such undocumented. -# http://docs.python.org/lib/profile.html -PROFILE = False - -# This is the time in seconds to wait between requests. This is the state of -# the -w command line option. -WAIT_BETWEEN_REQUESTS = 0 - -# Redirect depth, the number of redirects to follow. This is the state of the -# -r command line option. -REDIRECT_DEPTH = 5 - -# The list of plugins that will be used to generate the report. -PLUGINS = ['anchors', - 'sitemap', - 'urllist', - 'images', - 'external', - 'notchkd', - 'badlinks', - 'old', - 'new', - 'size', - 'notitles', - 'problems', - 'about'] - -# Whether to overwrite files without asking. This is the state of the -f -# command line option. -OVERWRITE_FILES = False - -# Whether to add extra headers to outgoing requests, requesting to -# disable caching, ensuring that a fresh page is returned -BYPASSHTTPCACHE = False - -# The number of levels the sitemap plugin should show. 
-REPORT_SITEMAP_LEVEL = 8 - -# The age of pages in days that after which a page is considered too old. -REPORT_WHATSOLD_URL_AGE = 700 - -# The age of pages in days within wich a page is considered new. -REPORT_WHATSNEW_URL_AGE = 7 - -# The size of a page in kilobytes after which the page is considered too big. -REPORT_SLOW_URL_SIZE = 76 - -# The maximum number of links to show in the "referenced from:" lists -PARENT_LISTLEN = 10 - -# Whether to open links in a new window (add target="_blank") -# (disabled by default because it is not xhtml 1.1) -REPORT_LINKS_IN_NEW_WINDOW = False - -# A list of names that will be checked when encountering an file:/// -# directory. This file will be picked up instead of the directory list. -FILE_INDEXES = ['index.html', 'index.htm'] - -# A list of names that will be checked when encountering an ftp:// -# directory. This file will be picked up instead of the directory list. -FTP_INDEXES = ['index.html', 'index.htm'] - -# Whether to fetch robots.txt files and do checking based on the information -# present in those files (normally matching links are yanked). -USE_ROBOTS = True - -# This is a hash that maps netlocs (e.g. some.server.com:8000) to -# username/password combinations that are passed as basic authentication -# to that netloc -USERPASS = {} - -# Options for tidy (make None to disable running tidy) -# See http://tidy.sourceforge.net/docs/quickref.html for details. -TIDY_OPTIONS = dict(quiet=1, - accessibility_check=1, - show_errors=6, - show_warnings=1, - char_encoding='raw') diff --git a/crawler.py b/crawler.py deleted file mode 100644 index 5c842db..0000000 --- a/crawler.py +++ /dev/null @@ -1,421 +0,0 @@ - -# crawler.py - definition of Link class for storing the crawled site -# -# Copyright (C) 1998, 1999 Albert Hopkins (marduk) -# Copyright (C) 2002 Mike W. Meyer -# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""General module to do site-checking. This module contains the Site class -containing the state for the crawled site and some functions to access and -manipulate the crawling of the website. 
This module also contains the Link -class that holds all the link related properties.""" - -import atexit -import cookielib -import datetime -import httplib -import os -import re -import robotparser -import socket -import time -import urllib -import urllib2 -import urlparse - -import config -import db -import debugio -import parsers - - -class RedirectError(urllib2.HTTPError): - def __init__(self, url, code, msg, hdrs, fp, newurl): - self.newurl = newurl - urllib2.HTTPError.__init__(self, url, code, msg, hdrs, fp) - - -class NoRedirectHandler(urllib2.HTTPRedirectHandler): - - def redirect_request(self, req, fp, code, msg, headers, newurl): - raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl) - - -def setup_urllib2(): - """Configure the urllib2 module to store cookies in the output - directory.""" - filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp') - # set up our cookie jar - cookiejar = cookielib.LWPCookieJar(filename) - try: - cookiejar.load(ignore_discard=False, ignore_expires=False) - except IOError: - pass - atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False) - # set up our custom opener that sets a meaningful user agent - opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), - NoRedirectHandler()) - opener.addheaders = [ - ('User-agent', 'webcheck %s' % config.VERSION), - ] - if config.BYPASSHTTPCACHE: - opener.addheaders.append(('Cache-control', 'no-cache')) - opener.addheaders.append(('Pragma', 'no-cache')) - urllib2.install_opener(opener) - - -# pattern for matching spaces -_spacepattern = re.compile(' ') - -# pattern to match anchor part of a url -_anchorpattern = re.compile('#([^#]+)$') - - -# TODO: rename Site to Crawler -class Site(object): - """Class to represent gathered data of a site. - - The available properties of this class are: - - bases - a list of base link object - """ - - def __init__(self): - """Creates an instance of the Site class and initializes the - state of the site.""" - # list of internal urls - self._internal_urls = set() - # list of regexps considered internal - self._internal_res = {} - # list of regexps considered external - self._external_res = {} - # list of regexps matching links that should not be checked - self._yanked_res = {} - # map of scheme+netloc to robot handleds - self._robotparsers = {} - # list of base urls (these are the internal urls to start from) - self.bases = [] - - def add_internal(self, url): - """Add the given url and consider all urls below it to be internal. - These links are all marked for checking with the crawl() function.""" - url = db.Link.clean_url(url) - if url not in self._internal_urls: - self._internal_urls.add(url) - - def add_internal_re(self, exp): - """Adds the gived regular expression as a pattern to match internal - urls.""" - self._internal_res[exp] = re.compile(exp, re.IGNORECASE) - - def add_external_re(self, exp): - """Adds the gived regular expression as a pattern to match external - urls.""" - self._external_res[exp] = re.compile(exp, re.IGNORECASE) - - def add_yanked_re(self, exp): - """Adds the gived regular expression as a pattern to match urls that - will not be checked at all.""" - self._yanked_res[exp] = re.compile(exp, re.IGNORECASE) - - def _is_internal(self, url): - """Check whether the specified url is external or internal. 
- This uses the urls marked with add_internal() and the regular - expressions passed with add_external_re().""" - # check if it is internal through the regexps - for regexp in self._internal_res.values(): - if regexp.search(url) is not None: - return True - res = False - # check that the url starts with an internal url - if config.BASE_URLS_ONLY: - # the url must start with one of the _internal_urls - for i in self._internal_urls: - res |= (i == url[:len(i)]) - else: - # the netloc must match a netloc of an _internal_url - netloc = urlparse.urlsplit(url)[1] - for i in self._internal_urls: - res |= (urlparse.urlsplit(i)[1] == netloc) - # if it is not internal now, it never will be - if not res: - return False - # check if it is external through the regexps - for x in self._external_res.values(): - # if the url matches it is external and we can stop - if x.search(url): - return False - return True - - def _get_robotparser(self, scheme, netloc): - """Return the proper robots parser for the given url or None if one - cannot be constructed. Robot parsers are cached per scheme and - netloc.""" - # only some schemes have a meaningful robots.txt file - if scheme != 'http' and scheme != 'https': - debugio.debug('crawler._get_robotparser() ' - 'called with unsupported scheme (%s)' % scheme) - return None - # split out the key part of the url - location = urlparse.urlunsplit((scheme, netloc, '', '', '')) - # try to create a new robotparser if we don't already have one - if location not in self._robotparsers: - debugio.info(' getting robots.txt for %s' % location) - self._robotparsers[location] = None - try: - rp = robotparser.RobotFileParser() - rp.set_url(urlparse.urlunsplit( - (scheme, netloc, '/robots.txt', '', ''))) - rp.read() - self._robotparsers[location] = rp - except (TypeError, IOError, httplib.HTTPException): - # ignore any problems setting up robot parser - pass - return self._robotparsers[location] - - def _is_yanked(self, url): - """Check whether the specified url should not be checked at all. - This uses the regualr expressions passed with add_yanked_re() and the - robots information present.""" - # check if it is yanked through the regexps - for regexp in self._yanked_res.values(): - # if the url matches it is yanked and we can stop - if regexp.search(url): - return 'yanked' - # check if we should avoid external links - is_internal = self._is_internal(url) - if not is_internal and config.AVOID_EXTERNAL_LINKS: - return 'external avoided' - # check if we should use robot parsers - if not config.USE_ROBOTS: - return None - (scheme, netloc) = urlparse.urlsplit(url)[0:2] - # skip schemes not having robot.txt files - if scheme not in ('http', 'https'): - return None - # skip robot checks for external urls - # TODO: make this configurable - if not is_internal: - return None - # check robots for remaining links - rp = self._get_robotparser(scheme, netloc) - if rp and not rp.can_fetch('webcheck', url): - return 'robot restriced' - # fall back to allowing the url - return None - - def get_link(self, session, url): - # try to find the URL - url = db.Link.clean_url(url) - link = session.query(db.Link).filter_by(url=url).first() - if not link: - link = db.Link(url=url) - session.add(link) - return link - - def get_links_to_crawl(self, session): - links = session.query(db.Link).filter(db.Link.fetched == None) - return links.filter(db.Link.yanked == None) - - def crawl(self): - """Crawl the website based on the urls specified with - add_internal(). 
If the serialization file pointer - is specified the crawler writes out updated links to - the file while crawling the site.""" - # get a database session - session = db.Session() - # remove all links - if not config.CONTINUE: - session.query(db.LinkProblem).delete() - session.commit() - session.query(db.PageProblem).delete() - session.commit() - session.execute(db.children.delete()) - session.commit() - session.execute(db.embedded.delete()) - session.commit() - session.query(db.Link).delete() - session.commit() - # add all internal urls to the database - for url in self._internal_urls: - url = db.Link.clean_url(url) - self.get_link(session, url) - # add some URLs from the database that haven't been fetched - tocheck = self.get_links_to_crawl(session) - remaining = tocheck.count() - tocheck = tocheck[:100] - remaining -= len(tocheck) - # repeat until we have nothing more to check - while tocheck: - # choose a link from the tocheck list - link = tocheck.pop() - link.is_internal = self._is_internal(link.url) - link.yanked = self._is_yanked(link.url) - # see if there are any more links to check - if not tocheck: - tocheck = self.get_links_to_crawl(session) - remaining = tocheck.count() - tocheck = tocheck[:100] - remaining -= len(tocheck) - # skip link it there is nothing to check - if link.yanked or link.fetched: - continue - # fetch the link's contents - response = self.fetch(link) - if response: - self.parse(link, response) - # flush database changes - session.commit() - # sleep between requests if configured - if config.WAIT_BETWEEN_REQUESTS > 0: - debugio.debug('crawler.crawl(): sleeping %s seconds' % - config.WAIT_BETWEEN_REQUESTS) - time.sleep(config.WAIT_BETWEEN_REQUESTS) - debugio.debug('crawler.crawl(): items left to check: %d' % - (remaining + len(tocheck))) - session.commit() - - def fetch(self, link): - """Attempt to fetch the url (if not yanked) and fill in link - attributes (based on is_internal).""" - debugio.info(' %s' % link.url) - # mark the link as fetched to avoid loops - link.fetched = datetime.datetime.now() - # see if we can import the proper module for this scheme - try: - # FIXME: if an URI has a username:passwd add the uri, username and password to the HTTPPasswordMgr - request = urllib2.Request(link.url) - parent = link.parents.first() - if parent: - request.add_header('Referer', parent.url) - response = urllib2.urlopen(request) - link.mimetype = response.info().gettype() - link.set_encoding(response.headers.getparam('charset')) - # FIXME: get result code and other stuff - link.status = str(response.code) - # link.size = int(response.getheader('Content-length')) - # link.mtime = time.mktime(response.msg.getdate('Last-Modified')) - # if response.status == 301: link.add_linkproblem(str(response.status)+': '+response.reason) - # elif response.status != 200: link.add_linkproblem(str(response.status)+': '+response.reason) - # TODO: add checking for size - return response - except RedirectError, e: - link.status = str(e.code) - debugio.info(' ' + str(e)) - if e.code == 301: - link.add_linkproblem(str(e)) - link.add_redirect(e.newurl) - return - except urllib2.HTTPError, e: - link.status = str(e.code) - debugio.info(' ' + str(e)) - link.add_linkproblem(str(e)) - return - except urllib2.URLError, e: - debugio.info(' ' + str(e)) - link.add_linkproblem(str(e)) - return - except KeyboardInterrupt: - # handle this in a higher-level exception handler - raise - except Exception, e: - # handle all other exceptions - debugio.warn('unknown exception caught: ' + str(e)) - 
link.add_linkproblem('error reading HTTP response: %s' % str(e)) - import traceback - traceback.print_exc() - return - - def parse(self, link, response): - """Parse the fetched response.""" - # find a parser for the content-type - parsermodule = parsers.get_parsermodule(link.mimetype) - if parsermodule is None: - debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype) - return - # skip parsing of content if we were returned nothing - content = response.read() - if content is None: - return - # parse the content - debugio.debug('crawler.Link.fetch(): parsing using %s' % parsermodule.__name__) - try: - parsermodule.parse(content, link) - except Exception, e: - import traceback - traceback.print_exc() - debugio.warn('problem parsing page: ' + str(e)) - link.add_pageproblem('problem parsing page: ' + str(e)) - - def postprocess(self): - """Do some basic post processing of the collected data, including - depth calculation of every link.""" - # get a database session - session = db.Session() - # build the list of urls that were set up with add_internal() that - # do not have a parent (they form the base for the site) - for url in self._internal_urls: - link = self.get_link(session, url).follow_link() - if not link: - debugio.warn('base link %s redirects to nowhere' % url) - continue - # add the link to bases - debugio.debug('crawler.postprocess(): adding %s to bases' % link.url) - self.bases.append(link) - # if we got no bases, just use the first internal one - if not self.bases: - link = session.query(db.Link).filter(db.Link.is_internal == True).first() - debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url) - self.bases.append(link) - # do a breadth first traversal of the website to determine depth - session.query(db.Link).update(dict(depth=None), synchronize_session=False) - session.commit() - depth = 0 - count = len(self.bases) - for link in self.bases: - link.depth = 0 - session.commit() - debugio.debug('crawler.postprocess(): %d links at depth 0' % count) - while count > 0: - # update the depth of all links without a depth that have a - # parent with the previous depth - qry = session.query(db.Link).filter(db.Link.depth == None) - qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth)) - count = qry.update(dict(depth=depth + 1), synchronize_session=False) - session.commit() - depth += 1 - debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth)) - # TODO: also handle embeds - # see if any of the plugins want to do postprocessing - for p in config.PLUGINS: - # import the plugin - plugin = __import__('plugins.' + p, globals(), locals(), [p]) - if hasattr(plugin, 'postprocess'): - debugio.info(' ' + p) - plugin.postprocess(self) - - def generate(self): - """Generate pages for plugins.""" - for p in config.PLUGINS: - # import the plugin - plugin = __import__('plugins.' + p, globals(), locals(), [p]) - if hasattr(plugin, 'generate'): - debugio.info(' ' + p) - plugin.generate(self) diff --git a/db.py b/db.py deleted file mode 100644 index 3426dfb..0000000 --- a/db.py +++ /dev/null @@ -1,300 +0,0 @@ - -# db.py - database access layer for webcheck -# -# Copyright (C) 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -import urlparse - -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import distinct, func -from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey -from sqlalchemy.orm import relationship, backref, sessionmaker -from sqlalchemy.orm.session import object_session -from sqlalchemy.sql.expression import ClauseElement, union - -import config -import debugio -import myurllib - - -# provide session and schema classes -Session = sessionmaker() -Base = declarative_base() - - -children = Table( - 'children', Base.metadata, - Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True), - Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - ) - - -embedded = Table( - 'embedded', Base.metadata, - Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True), - Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - ) - - -class Link(Base): - - __tablename__ = 'links' - - id = Column(Integer, primary_key=True) - url = Column(String, index=True, nullable=False, unique=True) - is_internal = Column(Boolean, index=True) - yanked = Column(String, index=True) - fetched = Column(DateTime, index=True) - - # information about the retrieved link - status = Column(String) - mimetype = Column(String) - mimetype = Column(String) - encoding = Column(String) - size = Column(Integer) - mtime = Column(DateTime, index=True) - is_page = Column(Boolean, index=True) - title = Column(String, index=True) - author = Column(String) - - # relationships between links - children = relationship('Link', secondary=children, - backref=backref('linked_from', lazy='dynamic'), - primaryjoin=(id == children.c.parent_id), - secondaryjoin=(id == children.c.child_id), - lazy='dynamic') - embedded = relationship('Link', secondary=embedded, - backref=backref('embedded_in', lazy='dynamic'), - primaryjoin=(id == embedded.c.parent_id), - secondaryjoin=(id == embedded.c.child_id), - lazy='dynamic') - - # crawling information - redirectdepth = Column(Integer, default=0) - depth = Column(Integer) - - @staticmethod - def clean_url(url): - # normalise the URL, removing the fragment from the URL - url = myurllib.normalizeurl(url) - return urlparse.urldefrag(myurllib.normalizeurl(url))[0] - - def _get_link(self, url): - """Get a link object for the specified URL.""" - # get the session - session = object_session(self) - # normalise the URL, removing the fragment from the URL - url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url)) - # try to find the link - instance = session.query(Link).filter_by(url=url).first() - if not instance: - instance = Link(url=url) - session.add(instance) - # mark that we were looking for an anchor/fragment - if fragment: - instance.add_reqanchor(self, fragment) - # return the link - return instance - - def set_encoding(self, encoding): - """Set the 
encoding of the link doing some basic checks to see if - the encoding is supported.""" - if not self.encoding and encoding: - try: - debugio.debug('crawler.Link.set_encoding(%r)' % encoding) - unicode('just some random text', encoding, 'replace') - self.encoding = encoding - except Exception, e: - import traceback - traceback.print_exc() - self.add_pageproblem('unknown encoding: %s' % encoding) - - def add_redirect(self, url): - """Indicate that this link redirects to the specified url.""" - url = self.clean_url(url) - # figure out depth - self.redirectdepth = max([self.redirectdepth] + - [x.redirectdepth for x in self.parents]) + 1 - # check depth - if self.redirectdepth >= config.REDIRECT_DEPTH: - self.add_linkproblem('too many redirects (%d)' % self.redirectdepth) - return - # check for redirect to self - if url == self.url: - self.add_linkproblem('redirect same as source: %s' % url) - return - # add child - self.add_child(url) - - def add_linkproblem(self, message): - """Indicate that something went wrong while retrieving this link.""" - self.linkproblems.append(LinkProblem(message=message)) - - def add_pageproblem(self, message): - """Indicate that something went wrong with parsing the document.""" - # only think about problems on internal pages - if not self.is_internal: - return - # TODO: only include a single problem once (e.g. multiple anchors) - self.pageproblems.append(PageProblem(message=message)) - - def add_child(self, url): - """Add the specified URL as a child of this link.""" - # ignore children for external links - if not self.is_internal: - return - # add to children - self.children.append(self._get_link(url)) - - def add_embed(self, url): - """Mark the given URL as used as an image on this page.""" - # ignore embeds for external links - if not self.is_internal: - return - # add to embedded - self.embedded.append(self._get_link(url)) - - def add_anchor(self, anchor): - """Indicate that this page contains the specified anchor.""" - # lowercase anchor - anchor = anchor.lower() - if self.anchors.filter(Anchor.anchor == anchor).first(): - self.add_pageproblem( - 'anchor/id "%(anchor)s" defined multiple times' - % {'anchor': anchor}) - else: - self.anchors.append(Anchor(anchor=anchor)) - - def add_reqanchor(self, parent, anchor): - """Indicate that the specified link contains a reference to the - specified anchor. This can be checked later.""" - # lowercase anchor - anchor = anchor.lower() - # if RequestedAnchor doesn't exist, add it - if not self.reqanchors.filter((RequestedAnchor.parent_id == parent.id) & (RequestedAnchor.anchor == anchor)).first(): - self.reqanchors.append(RequestedAnchor(parent_id=parent.id, anchor=anchor)) - - def follow_link(self, visited=None): - """If this link represents a redirect return the redirect target, - otherwise return self. 
If this redirect does not find a referenced - link None is returned.""" - # if this is not a redirect just return - if not self.redirectdepth: - return self - # if we don't know where this redirects, return None - if not self.children.count(): - return None - # avoid loops - if not visited: - visited = set() - visited.add(self.url) - # the first (and only) child is the redirect target - child = self.children.first() - if child.url in visited: - return None - # check where we redirect to - return child.follow_link(visited) - - @property - def count_parents(self): - session = object_session(self) - p1 = session.query(func.count(distinct(children.c.parent_id))).filter(children.c.child_id == self.id) - p2 = session.query(func.count(distinct(embedded.c.parent_id))).filter(embedded.c.child_id == self.id) - return p1.scalar() + p2.scalar() - - @property - def parents(self): - session = object_session(self) - #links = object_session(self).query(Link) - #links = links.join(children, Link.id == children.c.parent_id) - #links = links.join(embedded, Link.id == embedded.c.parent_id) - #return links.filter((children.c.child_id == self.id) | - # (embedded.c.child_id == self.id)).distinct() - parent_ids = union(session.query(children.c.parent_id).filter(children.c.child_id == self.id), - session.query(embedded.c.parent_id).filter(embedded.c.child_id == self.id)) - - return session.query(Link).filter(Link.id == parent_ids.c.children_parent_id).distinct() - - -class LinkProblem(Base): - """Storage of problems in the URL itself (e.g. problem downloading the - associated resource).""" - - __tablename__ = 'linkproblems' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - message = Column(String, index=True) - link = relationship(Link, backref=backref('linkproblems', order_by=message, - cascade='all,delete,delete-orphan')) - - def __unicode__(self): - return self.message - - -class PageProblem(Base): - """Storage of problems in the information from the retrieved URL (e.g. 
- invalid HTML).""" - - __tablename__ = 'pageproblems' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - message = Column(String, index=True) - link = relationship(Link, backref=backref('pageproblems', order_by=message, - cascade='all,delete,delete-orphan')) - - def __unicode__(self): - return self.message - - -class Anchor(Base): - """The named anchors (IDs) found on the page.""" - - __tablename__ = 'anchors' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - link = relationship(Link, backref=backref('anchors', - lazy='dynamic', - cascade='all,delete,delete-orphan')) - anchor = Column(String) - - def __unicode__(self): - return self.anchor - - -class RequestedAnchor(Base): - """The named anchors (IDs) found on the page.""" - - __tablename__ = 'reqanchors' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - link = relationship(Link, backref=backref('reqanchors', - lazy='dynamic', - cascade='all,delete,delete-orphan', - ), primaryjoin='Link.id == RequestedAnchor.link_id') - parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - parent = relationship(Link, primaryjoin='Link.id == RequestedAnchor.parent_id') - anchor = Column(String) - - def __unicode__(self): - return self.anchor diff --git a/debugio.py b/debugio.py deleted file mode 100644 index 6d7f698..0000000 --- a/debugio.py +++ /dev/null @@ -1,65 +0,0 @@ - -# debugio.py - output logging module -# -# Copyright (C) 1998, 1999 Albert Hopkins (marduk) -# Copyright (C) 2002 Mike W. Meyer -# Copyright (C) 2005, 2006, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""Debugging and message output module. - -This module contains facilities for logging program output. 
The use of -this module is really simple: import it, set loglevel, and use debug(), -info(), warn() and error() whenever you want to print something.""" - -import sys - -# log levels that can be used -ERROR = 0 -WARN = 1 -INFO = 2 -DEBUG = 3 - -# initialize logging at default level -loglevel = INFO - - -def debug(msg): - """Log the message to stderr if loglevel will allow it.""" - if loglevel >= DEBUG: - sys.stderr.write('webcheck: DEBUG: ' + str(msg) + '\n') - - -def info(msg): - """Log the message to stdout if loglevel will allow it.""" - if loglevel >= INFO: - sys.stdout.write('webcheck: ' + str(msg) + '\n') - sys.stdout.flush() - - -def warn(msg): - """Log a warning to stderr if loglevel will allow it.""" - if loglevel >= WARN: - sys.stderr.write('webcheck: Warning: ' + str(msg) + '\n') - - -def error(msg): - """Log an error to stderr if loglevel will allow it.""" - if loglevel >= ERROR: - sys.stderr.write('webcheck: Error: ' + str(msg) + '\n') diff --git a/monkeypatch.py b/monkeypatch.py deleted file mode 100644 index cf9218e..0000000 --- a/monkeypatch.py +++ /dev/null @@ -1,81 +0,0 @@ - -# monkeypatch.py - add missing functionality to standard modules -# -# Copyright (C) 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -import re -import urlparse -import urllib -import sys - - -__all__ = [] - - -# This monkeypatches RuleLine.applies_to to support * and $ characters in -# robots.txt path names. 
-def my_applies_to(ruleline, filename): - if not hasattr(ruleline, 'pattern'): - pat = [] - # we need to unescape the * from the path here - for x in ruleline.path.replace('%2A', '*'): - if x == '*': - pat.append('.*') - elif x == '$': - pat.append(r'\Z') - else: - pat.append(re.escape(x)) - ruleline.pattern = re.compile(''.join(pat) + '(?ms)') - return bool(ruleline.pattern.match(filename)) - -from robotparser import RuleLine -RuleLine.applies_to = my_applies_to - - -# This monkeypatches RobotFileParser.can_fetch to include the query string -# into the tested part of the URL, taken from http://bugs.python.org/issue6325 -# this should be fixed in Python 2.7 -if sys.version_info < (2, 7): - - def my_can_fetch(rfp, useragent, url): - """using the parsed robots.txt decide if useragent can fetch url""" - if rfp.disallow_all: - return False - if rfp.allow_all: - return True - # search for given user agent matches - # the first match counts - parsed_url = urlparse.urlparse(urllib.unquote(url)) - url = urlparse.urlunparse(('', '', parsed_url.path, - parsed_url.params, parsed_url.query, parsed_url.fragment)) - url = urllib.quote(url) - if not url: - url = "/" - for entry in rfp.entries: - if entry.applies_to(useragent): - return entry.allowance(url) - # try the default entry last - if rfp.default_entry: - return rfp.default_entry.allowance(url) - # agent not found ==> access granted - return True - - from robotparser import RobotFileParser - RobotFileParser.can_fetch = my_can_fetch diff --git a/myurllib.py b/myurllib.py deleted file mode 100644 index bd5987c..0000000 --- a/myurllib.py +++ /dev/null @@ -1,120 +0,0 @@ - -# myurllib.py - general purpose URL handling library -# -# Copyright (C) 2007, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -import urlparse -import re -import urllib - -# this is a workaround for Python 2.3 -try: - set -except NameError: - from sets import Set as set - -# The way I read RFC3986 (especially sections 3.3 and 6.2) is that these -# are all separate and valid URLs that point to the same resource. -# -# In section 6.2.2.3 only the removal of "." and ".." in paths is -# mentioned although 6.2.3 does leave some room for other normalisation. 
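
The RFC 3986 reading sketched in the comment above is what normalizeurl() further down in this file implements. As an illustration only (the URL below is hypothetical and not part of the patch), and assuming _urlclean() behaves as written, the helper should fold case, an explicit default port and harmless %-escapes into a single canonical spelling, in the module's Python 2 idiom:

    import myurllib  # after this commit: from webcheck import myurllib

    # Upper-case scheme and host, an explicit default port and a %7E escape
    # should all collapse to the same canonical form (hypothetical URL).
    print myurllib.normalizeurl('HTTP://Example.COM:80/b/c%7Ed')
    # expected: http://example.com/b/c~d
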
- -# pattern for matching URL-encoded characters -_urlencpattern = re.compile('(%[0-9a-fA-F]{2})') - -# characters that should be unescaped in URLs -_okurlchars = set('-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' \ - '_abcdefghijklmnopqrstuvwxyz~') - -# pattern for matching characters that should be escaped -_urlprobpattern = re.compile('([^-;/?:@&=+$,%#.0123456789' \ - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_' \ - 'abcdefghijklmnopqrstuvwxyz~])') - -# pattern for double slashes -_doubleslashpattern = re.compile('//+') - -# pattern for leading dots -_leadingdotpattern = re.compile('^(/\.\.)*') - - -def _unescape_printable(match): - """Helper function for _normalize_escapes() to perform the expansion of - html entity refs that are normal printable (but not reserver) - characters.""" - # unescape the character - r = chr(int(match.group(1)[1:3], 16)) - if r in _okurlchars: - return r - # transform remaining escapes to uppercase - return match.group(1).upper() - - -def _normalize_escapes(url): - """Ensure that escaping in the url is consistent. Any reserved characters - are left alone. Any characters that are printable but are escaped are - unescaped. Any non-printable characters are escaped.""" - # url decode any printable normal characters (this leaves us with a string - # with as much stuff unquoted as # possible) - url = _urlencpattern.sub(_unescape_printable, url) - # url encode any nonprintable or problematic characters (but not reserved - # characters) so we're left with a string with everything that needs to be - # quoted as such - url = _urlprobpattern.sub(lambda x: '%%%02X' % ord(x.group(1)), url) - return url - - -def _urlclean(url): - """Clean the url of uneccesary parts.""" - # make escaping consistent - url = _normalize_escapes(url) - # split the url in useful parts - (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url) - # remove any leading /../ parts - if scheme in ('http', 'https'): - path = _leadingdotpattern.sub('', path) - if scheme in ('http', 'https', 'ftp'): - # http(s) urls should have a non-empty path - if path == '': - path = '/' - # make hostname lower case - (userpass, hostport) = urllib.splituser(netloc) - (host, port) = urllib.splitport(hostport) - # remove default port - if scheme == 'http' and str(port) == '80': - hostport = host - elif scheme == 'https' and str(port) == '443': - hostport = host - netloc = hostport.lower() - # trim trailing : - if netloc[-1:] == ':': - netloc = netloc[:-1] - if userpass is not None: - netloc = userpass + '@' + netloc - # get rid of double slashes in some paths - if scheme == 'file': - path = _doubleslashpattern.sub('/', path) - # put the url back together again - return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) - - -def normalizeurl(url): - """Return a normalized URL.""" - return _urlclean(url) diff --git a/parsers/__init__.py b/parsers/__init__.py deleted file mode 100644 index 3bfbd1f..0000000 --- a/parsers/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ - -# __init__.py - general content-type parser interface -# -# Copyright (C) 2005, 2006, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""This package groups all the supported content-types. - -A content-type module can be requested by the get_parsemodule() function. -Each module should export the following function: - - parse(content, link) - Based on the content, fill in the common fields of the link object.""" - -# the modules that should be imported -_modules = ('html', 'css') - -# a map of mimetypes to modules -_parsermodules = {} - - -def _init_modules(): - """Initialize the modules.""" - # go throught all known modules to probe the content-types - # (do this only once) - for mod in _modules: - parser = __import__('parsers.' + mod, globals(), locals(), [mod]) - for mimetype in parser.mimetypes: - _parsermodules[mimetype] = parser - - -def get_parsermodule(mimetype): - """Look up the correct module for the specified mimetype.""" - if _parsermodules == {}: - _init_modules() - # check if we have a supported content-type - if mimetype in _parsermodules: - return _parsermodules[mimetype] - return None - - -def get_mimetypes(): - """Return a list of supported mime types that can be parsed - by the installed parsers.""" - if _parsermodules == {}: - _init_modules() - return _parsermodules.keys() diff --git a/parsers/css.py b/parsers/css.py deleted file mode 100644 index 5ab2905..0000000 --- a/parsers/css.py +++ /dev/null @@ -1,55 +0,0 @@ - -# css.py - parser functions for css content -# -# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""This modules attempts to parse CSS files. -It currently looks for url() links in stylesheet contents and also -looks for @import processing directives.""" - -mimetypes = ('text/css',) - -import urlparse -import re - -# pattern for matching /* ... */ comments in css -_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL) - -# pattern for matching @import "url" statments in css -_importpattern = re.compile('@import\s+["\']([^"\']*)["\']', - re.IGNORECASE | re.DOTALL) - -# pattern for matching url(...) 
diff --git a/parsers/css.py b/parsers/css.py
deleted file mode 100644
index 5ab2905..0000000
--- a/parsers/css.py
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# css.py - parser functions for css content
-#
-# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-# The files produced as output from the software do not automatically fall
-# under the copyright of the software, unless explicitly stated otherwise.
-
-"""This module attempts to parse CSS files.
-It currently looks for url() links in stylesheet contents and also
-looks for @import processing directives."""
-
-mimetypes = ('text/css',)
-
-import urlparse
-import re
-
-# pattern for matching /* ... */ comments in css
-_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL)
-
-# pattern for matching @import "url" statements in css
-_importpattern = re.compile('@import\s+["\']([^"\']*)["\']',
-                            re.IGNORECASE | re.DOTALL)
-
-# pattern for matching url(...) in css
-_urlpattern = re.compile('url\(["\']?(.*?)["\']?\)')
-
-
-def parse(content, link, base=None):
-    """Parse the specified content and extract information for crawling the
-    site further."""
-    # if no base is specified, get it from the link
-    base = base or link.url
-    # strip out comments from the content
-    content = _commentpattern.sub('', content)
-    # handle @imports
-    for embed in _importpattern.findall(content):
-        link.add_embed(urlparse.urljoin(base, embed))
-    # handle url()s
-    for embed in _urlpattern.findall(content):
-        link.add_embed(urlparse.urljoin(base, embed))
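
The css parser only needs an object with a url attribute and an add_embed() method, which makes it easy to exercise on its own. A sketch using a stand-in for the real crawler.Link object (FakeLink is hypothetical; only the two members used by parse() are provided), assuming the module is importable as webcheck.parsers.css after this move:

    # sketch only: feeding a stylesheet to the css parser by hand
    from webcheck.parsers import css


    class FakeLink(object):
        """Stand-in for crawler.Link that just collects embedded URLs."""

        def __init__(self, url):
            self.url = url
            self.embeds = []

        def add_embed(self, url):
            self.embeds.append(url)


    link = FakeLink('http://example.com/css/site.css')
    css.parse('/* @import "commented-out.css" */\n'
              '@import "print.css";\n'
              'body { background: url(../img/bg.png); }', link)
    print link.embeds
    # expected: ['http://example.com/css/print.css',
    #            'http://example.com/img/bg.png']
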
diff --git a/parsers/html/__init__.py b/parsers/html/__init__.py
deleted file mode 100644
index 09966f4..0000000
--- a/parsers/html/__init__.py
+++ /dev/null
@@ -1,123 +0,0 @@
-
-# html.py - parser functions for html content
-#
-# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-# The files produced as output from the software do not automatically fall
-# under the copyright of the software, unless explicitly stated otherwise.
-
-"""Parser functions for processing HTML content. This is a front-end
-module that tries to load the BeautifulSoup parser first and falls
-back to loading the legacy HTMLParser parser."""
-
-import debugio
-import re
-import htmlentitydefs
-import config
-
-# the list of mimetypes this module should be able to handle
-mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
-
-# pattern for matching all html entities
-_entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
-
-
-def htmlescape(txt):
-    """HTML escape the given string and return an ASCII clean string with
-    known entities and character entities for the other values."""
-    # check for empty string
-    if not txt:
-        return u''
-    # convert to unicode object
-    if not isinstance(txt, unicode):
-        txt = unicode(txt)
-    # the output string
-    out = ''
-    # loop over the characters of the string
-    for c in txt:
-        if ord(c) in htmlentitydefs.codepoint2name:
-            out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
-        elif ord(c) > 126:
-            out += '&#%d;' % ord(c)
-        else:
-            out += c.encode('utf-8')
-    return out
-
-
-def _unescape_entity(match):
-    """Helper function for htmlunescape().
-    This function unescapes an html entity, it is passed to the sub()
-    function."""
-    if match.group(1) in htmlentitydefs.name2codepoint:
-        # we have a named entity, return proper character
-        return unichr(htmlentitydefs.name2codepoint[match.group(1)])
-    elif match.group(1)[0] == '#':
-        # we have a numeric entity, replace with proper character
-        return unichr(int(match.group(1)[1:]))
-    else:
-        # we have something else, just keep the original
-        return match.group(0)
-
-
-def htmlunescape(txt):
-    """This function unescapes an html encoded string.
-    This function returns a unicode string."""
-    # check for empty string
-    if not txt:
-        return u''
-    # convert to unicode
-    if not isinstance(txt, unicode):
-        txt = unicode(txt, errors='replace')
-    # replace &name; and &#nn; refs with proper characters
-    txt = _entitypattern.sub(_unescape_entity, txt)
-    # we're done
-    return txt
-
-
-def _parsefunction(content, link):
-    # find a suitable parse function and replace this stub with it
-    global _parsefunction
-    try:
-        # try BeautifulSoup parser first
-        import parsers.html.beautifulsoup
-        debugio.debug('parsers.html.parse(): the BeautifulSoup parser is ok')
-        _parsefunction = parsers.html.beautifulsoup.parse
-    except ImportError:
-        # fall back to legacy HTMLParser parser
-        debugio.warn('falling back to the legacy HTML parser, '
-                     'consider installing BeautifulSoup')
-        import parsers.html.htmlparser
-        _parsefunction = parsers.html.htmlparser.parse
-    # call the actual parse function
-    _parsefunction(content, link)
-
-
-def parse(content, link):
-    """Parse the specified content and extract a url list, a list of images,
-    a title and an author. The content is assumed to contain HTML."""
-    # call the normal parse function
-    _parsefunction(content, link)
-    # call the tidy parse function
-    if config.TIDY_OPTIONS:
-        try:
-            import calltidy
-            debugio.debug('parsers.html.parse(): the Tidy parser is ok')
-            calltidy.parse(content, link)
-        except ImportError:
-            debugio.warn('tidy library (python-utidylib) is unavailable')
-            # remove config to only try once
-            config.TIDY_OPTIONS = None
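
htmlescape() and htmlunescape() are rough inverses: the first produces an ASCII-only byte string using named or numeric entities, the second turns entity references back into a unicode string. A quick sketch, assuming the relocated module imports cleanly as webcheck.parsers.html:

    # sketch only: round-tripping a string through the helpers above
    from webcheck.parsers import html

    print html.htmlescape(u'caf\xe9 & more')
    # expected: 'caf&eacute; &amp; more'

    print html.htmlunescape('caf&eacute; &amp; more&#8230;')
    # expected: u'caf\xe9 & more\u2026'
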
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
deleted file mode 100644
index 268014d..0000000
--- a/parsers/html/beautifulsoup.py
+++ /dev/null
@@ -1,191 +0,0 @@
-
-# beautifulsoup.py - parser functions for html content
-#
-# Copyright (C) 2007, 2008, 2009, 2011 Arthur de Jong
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-# The files produced as output from the software do not automatically fall
-# under the copyright of the software, unless explicitly stated otherwise.
-
-"""Parser functions for processing HTML content. This module uses the
-BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser
-module."""
-
-import urlparse
-import crawler
-import re
-import htmlentitydefs
-import BeautifulSoup
-import myurllib
-from parsers.html import htmlunescape
-
-# pattern for matching http-equiv and content part of
-# <meta http-equiv="refresh" content="0;url=URL">
-_refreshhttpequivpattern = re.compile('^refresh$', re.I)
-_refershcontentpattern = re.compile('^[0-9]+;url=(.*)$', re.I)
-
-# check BeautifulSoup find() function for bugs
-if BeautifulSoup.BeautifulSoup('<foo>').find('foo', bar=True):
-    import debugio
-    debugio.warn('using buggy version of BeautifulSoup (%s)' %
-                 BeautifulSoup.__version__)
-
-
-def parse(content, link):
-    """Parse the specified content and extract a url list, a list of images,
-    a title and an author. The content is assumed to contain HTML."""
-    # create parser and feed it the content
-    soup = BeautifulSoup.BeautifulSoup(content,
-                                       fromEncoding=str(link.encoding))
-    # fetch document encoding
-    link.set_encoding(soup.originalEncoding)
-    # <title>TITLE</title>
-    title = soup.find('title')
-    if title and title.string:
-        link.title = htmlunescape(title.string).strip()
-
-    # FIXME: using myurllib.normalizeurl is wrong below, we should probably use
-    #        something like link.urlunescape() to do the escaping and check
-    #        and log at the same time
-
-    # <base href="url">
-    base = soup.find('base', href=True)
-    if base:
-        base = myurllib.normalizeurl(htmlunescape(base['href']).strip())
-    else:
-        base = link.url
-    # <link rel="type" href="url">
-    for l in soup.findAll('link', rel=True, href=True):
-        if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon',
-                                'shortcut icon'):
-            embed = myurllib.normalizeurl(htmlunescape(l['href']).strip())
-            if embed:
-                link.add_embed(urlparse.urljoin(base, embed))
-    # <meta name="author" content="AUTHOR">
-    author = soup.find('meta', attrs={'name': re.compile("^author$", re.I),
-                                      'content': True})
-    if author and author['content']:
-        link.author = htmlunescape(author['content']).strip()
-    # <meta http-equiv="refresh" content="0;url=URL">
-    refresh = soup.find('meta', attrs={'http-equiv': _refreshhttpequivpattern,
-                                       'content': True})
-    if refresh and refresh['content']:
-        try:
-            child = _refershcontentpattern.search(refresh['content']).group(1)
-        except AttributeError:
-            pass  # ignore cases where refresh header parsing causes problems
-        else:
-            link.add_child(urlparse.urljoin(base, child))
-    # <img src="url">
-    for img in soup.findAll('img', src=True):
-        embed = myurllib.normalizeurl(htmlunescape(img['src']).strip())
-        if embed:
-            link.add_embed(urlparse.urljoin(base, embed))
-    # <a href="url">
-    for a in soup.findAll('a', href=True):
-        child = myurllib.normalizeurl(htmlunescape(a['href']).strip())
-        if child:
-            link.add_child(urlparse.urljoin(base, child))
-    # <a name="anchor">
-    # TODO: consistent url escaping?
-    for a in soup.findAll('a', attrs={'name': True}):
-        # get anchor name
-        a_name = myurllib.normalizeurl(htmlunescape(a['name']).strip())
-        # if both id and name are used they should be the same
-        if 'id' in a and \
-           a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()):
-            link.add_pageproblem(
-                'anchors defined in name and id attributes do not match')
-            # add the id anchor anyway
-            link.add_anchor(myurllib.normalizeurl(htmlunescape(a['id']).strip()))
-        # add the anchor
-        link.add_anchor(a_name)
-    # <any id="anchor">
-    for elem in soup.findAll(id=True):
-        # skip anchors that have a name
-        if elem.name == 'a' and 'name' in elem:
-            continue
-        # add the anchor
-        link.add_anchor(myurllib.normalizeurl(htmlunescape(elem['id']).strip()))
-    # <frame src="url">...
-    for frame in soup.findAll('frame', src=True):
-        embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip())
-        if embed:
-            link.add_embed(urlparse.urljoin(base, embed))
-    #