From 7d7b8cb696c023e3917c9f15485c0d544de7bbe7 Mon Sep 17 00:00:00 2001
From: Arthur de Jong
Date: Fri, 16 Sep 2011 13:36:38 +0000
Subject: move all the code except the command-line handling to the webcheck package and reorganise imports accordingly

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@435 86f53f14-5ff3-0310-afe5-9b438ce3f40c
---
 cmd.py | 295 +++++++++++++++++++++++
 config.py | 131 ----------
 crawler.py | 421 --------------------------------
 db.py | 300 -----------------------
 debugio.py | 65 -----
 monkeypatch.py | 81 -------
 myurllib.py | 120 ----------
 parsers/__init__.py | 63 -----
 parsers/css.py | 55 -----
 parsers/html/__init__.py | 123 ----------
 parsers/html/beautifulsoup.py | 191 ---------------
 parsers/html/calltidy.py | 36 ---
 parsers/html/htmlparser.py | 304 ------------------------
 plugins/__init__.py | 284 ----------------------
 plugins/about.py | 114 ---------
 plugins/anchors.py | 51 ----
 plugins/badlinks.py | 90 -------
 plugins/external.py | 68 ------
 plugins/images.py | 64 -----
 plugins/new.py | 77 ------
 plugins/notchkd.py | 68 ------
 plugins/notitles.py | 77 ------
 plugins/old.py | 79 ------
 plugins/problems.py | 129 ----------
 plugins/sitemap.py | 96 --------
 plugins/size.py | 97 --------
 plugins/urllist.py | 49 ----
 webcheck.py | 295 -----------------------
 webcheck/__init__.py | 0
 webcheck/config.py | 132 +++++++++++
 webcheck/crawler.py | 422 +++++++++++++++++++++++++++++++
 webcheck/db.py | 299 +++++++++++++++++++++++
 webcheck/debugio.py | 66 ++++++
 webcheck/monkeypatch.py | 81 +++++++
 webcheck/myurllib.py | 120 ++++++++++
 webcheck/parsers/__init__.py | 63 +++++
 webcheck/parsers/css.py | 56 +++++
 webcheck/parsers/html/__init__.py | 125 ++++++++++
 webcheck/parsers/html/beautifulsoup.py | 194 +++++++++++++++
 webcheck/parsers/html/calltidy.py | 37 +++
 webcheck/parsers/html/htmlparser.py | 306 ++++++++++++++++++++++++
 webcheck/plugins/__init__.py | 281 ++++++++++++++++++++++
 webcheck/plugins/about.py | 114 +++++++++
 webcheck/plugins/anchors.py | 51 ++++
 webcheck/plugins/badlinks.py | 90 +++++++
 webcheck/plugins/external.py | 68 ++++++
 webcheck/plugins/images.py | 64 +++++
 webcheck/plugins/new.py | 77 ++++++
 webcheck/plugins/notchkd.py | 68 ++++++
 webcheck/plugins/notitles.py | 77 ++++++
 webcheck/plugins/old.py | 79 ++++++
 webcheck/plugins/problems.py | 129 ++++++++++
 webcheck/plugins/sitemap.py | 96 ++++++++
 webcheck/plugins/size.py | 97 ++++++++
 webcheck/plugins/urllist.py | 49 ++++
 55 files changed, 3536 insertions(+), 3528 deletions(-)
 create mode 100755 cmd.py
 delete mode 100644 config.py
 delete mode 100644 crawler.py
 delete mode 100644 db.py
 delete mode 100644 debugio.py
 delete mode 100644 monkeypatch.py
 delete mode 100644 myurllib.py
 delete mode 100644 parsers/__init__.py
 delete mode 100644 parsers/css.py
 delete mode 100644 parsers/html/__init__.py
 delete mode 100644 parsers/html/beautifulsoup.py
 delete mode 100644 parsers/html/calltidy.py
 delete mode 100644 parsers/html/htmlparser.py
 delete mode 100644 plugins/__init__.py
 delete mode 100644 plugins/about.py
 delete mode 100644 plugins/anchors.py
 delete mode 100644 plugins/badlinks.py
 delete mode 100644 plugins/external.py
 delete mode 100644 plugins/images.py
 delete mode 100644 plugins/new.py
 delete mode 100644 plugins/notchkd.py
 delete mode 100644 plugins/notitles.py
 delete mode 100644 plugins/old.py
 delete mode 100644 plugins/problems.py
 delete mode 100644 plugins/sitemap.py
 delete mode 100644 plugins/size.py
 delete mode 100644 plugins/urllist.py
 delete mode 100755 webcheck.py
 create mode 100644
webcheck/__init__.py create mode 100644 webcheck/config.py create mode 100644 webcheck/crawler.py create mode 100644 webcheck/db.py create mode 100644 webcheck/debugio.py create mode 100644 webcheck/monkeypatch.py create mode 100644 webcheck/myurllib.py create mode 100644 webcheck/parsers/__init__.py create mode 100644 webcheck/parsers/css.py create mode 100644 webcheck/parsers/html/__init__.py create mode 100644 webcheck/parsers/html/beautifulsoup.py create mode 100644 webcheck/parsers/html/calltidy.py create mode 100644 webcheck/parsers/html/htmlparser.py create mode 100644 webcheck/plugins/__init__.py create mode 100644 webcheck/plugins/about.py create mode 100644 webcheck/plugins/anchors.py create mode 100644 webcheck/plugins/badlinks.py create mode 100644 webcheck/plugins/external.py create mode 100644 webcheck/plugins/images.py create mode 100644 webcheck/plugins/new.py create mode 100644 webcheck/plugins/notchkd.py create mode 100644 webcheck/plugins/notitles.py create mode 100644 webcheck/plugins/old.py create mode 100644 webcheck/plugins/problems.py create mode 100644 webcheck/plugins/sitemap.py create mode 100644 webcheck/plugins/size.py create mode 100644 webcheck/plugins/urllist.py diff --git a/cmd.py b/cmd.py new file mode 100755 index 0000000..dbbe9d6 --- /dev/null +++ b/cmd.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python + +# cmd.py - command-line front-end for webcheck +# +# Copyright (C) 1998, 1999 Albert Hopkins (marduk) +# Copyright (C) 2002 Mike W. Meyer +# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +# +# The files produced as output from the software do not automatically fall +# under the copyright of the software, unless explicitly stated otherwise. + +"""This is the main webcheck module.""" + +__version__ = '1.10.4' +__homepage__ = 'http://arthurdejong.org/webcheck/' + +import os +import re +import sys +import urllib +import urlparse + +from webcheck import config +# update some fields that currently are stored in config +config.VERSION = __version__ +config.HOMEPAGE = __homepage__ + +from webcheck import debugio +import webcheck.crawler +import webcheck.db +import webcheck.monkeypatch +import webcheck.plugins + +debugio.loglevel = debugio.INFO + + +def print_version(): + """Print version information.""" + sys.stdout.write( + 'webcheck %(version)s\n' + 'Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n' + '\n' + 'Copyright (C) 1998-2011\n' + 'Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.\n' + 'This is free software; see the source for copying conditions. There is NO\n' + 'warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n' + % {'version': __version__}) + + +def print_usage(): + """Print short usage information.""" + sys.stderr.write( + 'Usage: webcheck [OPTION]... 
URL...\n') + + +def print_tryhelp(): + """Print friendly pointer to more information.""" + sys.stderr.write( + 'Try \'webcheck --help\' for more information.\n') + + +def print_help(): + """Print the option list.""" + sys.stdout.write( + 'Usage: webcheck [OPTION]... URL...\n' + 'Generate a report for the given URLs\n' + '\n' + ' -i, --internal=PATTERN mark URLs matching PATTERN as internal\n' + ' -x, --external=PATTERN mark URLs matching PATTERN as external\n' + ' -y, --yank=PATTERN do not check URLs matching PATTERN\n' + ' -b, --base-only base URLs only: consider any URL not starting\n' + ' with any of the base URLs to be external\n' + ' -a, --avoid-external do not check external URLs\n' + ' --ignore-robots do not retrieve and parse robots.txt files\n' + ' -q, --quiet, --silent suppress progress messages\n' + ' -d, --debug do programmer-level debugging\n' + ' -o, --output=DIRECTORY store the generated reports in the specified\n' + ' directory\n' + ' -c, --continue try to continue from a previous run\n' + ' -f, --force overwrite files without asking\n' + ' -r, --redirects=N the number of redirects webcheck should follow,\n' + ' 0 implies to follow all redirects (default=%(redirects)d)\n' + ' -u, --userpass=URL specify a URL with user:pass so username and password are given\n' + ' to matching network locations, -u http://user:pass@example.com\n' + ' -w, --wait=SECONDS wait SECONDS between retrievals\n' + ' -V, --version output version information and exit\n' + ' -h, --help display this help and exit\n' + % {'redirects': config.REDIRECT_DEPTH}) + + +def parse_args(site): + """Parse command-line arguments.""" + import getopt + try: + optlist, args = getopt.gnu_getopt(sys.argv[1:], + 'i:x:y:l:baqdo:cfr:u:w:Vh', + ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external', + 'ignore-robots', + 'quiet', 'silent', 'debug', 'profile', 'output=', 'continue', + 'force', 'redirects=', 'userpass=', 'wait=', 'version', 'help')) + internal_urls = [] + external_urls = [] + yank_urls = [] + for flag, arg in optlist: + if flag in ('-i', '--internal'): + internal_urls.append(arg) + elif flag in ('-x', '--external'): + external_urls.append(arg) + elif flag in ('-y', '--yank'): + yank_urls.append(arg) + elif flag in ('-b', '--base-only'): + config.BASE_URLS_ONLY = True + elif flag in ('-a', '--avoid-external'): + config.AVOID_EXTERNAL_LINKS = True + elif flag in ('--ignore-robots',): + config.USE_ROBOTS = False + elif flag in ('-q', '--quiet', '--silent'): + debugio.loglevel = debugio.ERROR + elif flag in ('-d', '--debug'): + debugio.loglevel = debugio.DEBUG + elif flag in ('--profile',): + # undocumented on purpose + config.PROFILE = True + elif flag in ('-o', '--output'): + config.OUTPUT_DIR = arg + elif flag in ('-c', '--continue'): + config.CONTINUE = True + elif flag in ('-f', '--force'): + config.OVERWRITE_FILES = True + elif flag in ('-r', '--redirects'): + config.REDIRECT_DEPTH = int(arg) + elif flag in ('-u', '--userpass'): + (_scheme, _netloc, _path, _params, _query, _frag) = urlparse.urlparse(arg) + (_userpass, _netloc) = urllib.splituser(_netloc) + config.USERPASS[_netloc] = _userpass + elif flag in ('-w', '--wait'): + config.WAIT_BETWEEN_REQUESTS = float(arg) + elif flag in ('-V', '--version'): + print_version() + sys.exit(0) + elif flag in ('-h', '--help'): + print_help() + sys.exit(0) + if len(args) == 0 and not config.CONTINUE: + print_usage() + print_tryhelp() + sys.exit(1) + # ensure output directory exists + if not os.path.isdir(config.OUTPUT_DIR): + os.mkdir(config.OUTPUT_DIR) + # 
set up database connection + filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite') + from sqlalchemy import create_engine + engine = create_engine('sqlite:///' + filename) + webcheck.db.Session.configure(bind=engine) + # ensure that all tables are created + webcheck.db.Base.metadata.create_all(engine) + # TODO: schema migraton goes here + # add configuration to site + for pattern in internal_urls: + site.add_internal_re(pattern) + for pattern in external_urls: + site.add_external_re(pattern) + for pattern in yank_urls: + site.add_yanked_re(pattern) + for arg in args: + # if it does not look like a url it is probably a local file + if urlparse.urlsplit(arg)[0] == '': + arg = 'file://' + urllib.pathname2url(os.path.abspath(arg)) + site.add_internal(arg) + except getopt.error, reason: + sys.stderr.write('webcheck: %s\n' % reason) + print_tryhelp() + sys.exit(1) + except re.error, e: + sys.stderr.write('webcheck: %s\n' % str(e)) + sys.exit(1) + + +def install_file(source, text=False): + """Install the given file in the output directory. + If the text flag is set to true it is assumed the file is text, + translating line endings.""" + import shutil + import urlparse + # figure out mode to open the file with + mode = 'r' + if text: + mode += 'U' + # check with what kind of argument we are called + scheme = urlparse.urlsplit(source)[0] + if scheme == 'file': + # this is a file:/// url, translate to normal path and open + import urllib + source = urllib.url2pathname(urlparse.urlsplit(source)[2]) + elif scheme == '' and os.path.isabs(source): + # this is an absolute path, just open it as is + pass + elif scheme == '': + # this is a relavite path, try to fetch it from the python path + for directory in sys.path: + tst = os.path.join(directory, source) + if os.path.isfile(tst): + source = tst + break + # TODO: support more schemes here + # figure out the destination name + target = os.path.join(config.OUTPUT_DIR, os.path.basename(source)) + # test if source and target are the same + source = os.path.realpath(source) + if source == os.path.realpath(target): + debugio.warn('attempt to overwrite %(fname)s with itself' % {'fname': source}) + return + # open the input file + sfp = None + try: + sfp = open(source, mode) + except IOError, (errno, strerror): + debugio.error('%(fname)s: %(strerror)s' % + {'fname': source, + 'strerror': strerror}) + sys.exit(1) + # create file in output directory (with overwrite question) + tfp = webcheck.plugins.open_file(os.path.basename(source)) + # copy contents + shutil.copyfileobj(sfp, tfp) + # close files + tfp.close() + sfp.close() + + +def main(site): + """Main program.""" + # crawl through the website + debugio.info('checking site....') + webcheck.crawler.setup_urllib2() + site.crawl() # this will take a while + debugio.info('done.') + # do postprocessing (building site structure, etc) + debugio.info('postprocessing....') + site.postprocess() + debugio.info('done.') + # now we can write out the files + # start with the frame-description page + debugio.info('generating reports...') + # for every plugin, generate a page + site.generate() + # put extra files in the output directory + install_file('webcheck.css', True) + install_file('fancytooltips/fancytooltips.js', True) + install_file('favicon.ico', False) + debugio.info('done.') + + +if __name__ == '__main__': + try: + # initialize site object + site = webcheck.crawler.Site() + # parse command-line arguments + parse_args(site) + # run the main program + if config.PROFILE: + fname = 
os.path.join(config.OUTPUT_DIR, 'webcheck.prof') + try: + import cProfile + except ImportError: + import profile as cProfile + try: + import sqltap + sqltap.start() + except ImportError: + pass + cProfile.run('main(site)', fname) + if 'sqltap' in locals(): + statistics = sqltap.collect() + sqltap.report(statistics, os.path.join(config.OUTPUT_DIR, 'sqltap.html')) + else: + main(site) + except KeyboardInterrupt: + sys.stderr.write('Interrupted\n') + sys.exit(1) diff --git a/config.py b/config.py deleted file mode 100644 index e106b8e..0000000 --- a/config.py +++ /dev/null @@ -1,131 +0,0 @@ - -# config.py - configuration state for webcheck -# -# Copyright (C) 1998, 1999 Albert Hopkins (marduk) -# Copyright (C) 2002 Mike Meyer -# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""Configuration state for webcheck. - -This file contains the default configuration for webcheck. All configurable -items should be changeble from the command line.""" - -import urllib - -# Whether to consider any URL not starting with the base URL to be external. -# This is the state of the -b command line option. -BASE_URLS_ONLY = False - -# Avoid checking external links at all. This is the state of the -a command -# line option. -AVOID_EXTERNAL_LINKS = False - -# The proxy configuration. -PROXIES = urllib.getproxies_environment() - -# IO timeout as passed to socket.setdefaulttimeout() -# value is a float in seconds None disables the timeout -IOTIMEOUT = 10.0 - -# Output directory. This is the state of the -o command line option. -OUTPUT_DIR = '.' - -# Whether to try to read a state file to continue from. -CONTINUE = False - -# Whether to produce profiling information. This is for development -# purposes and as such undocumented. -# http://docs.python.org/lib/profile.html -PROFILE = False - -# This is the time in seconds to wait between requests. This is the state of -# the -w command line option. -WAIT_BETWEEN_REQUESTS = 0 - -# Redirect depth, the number of redirects to follow. This is the state of the -# -r command line option. -REDIRECT_DEPTH = 5 - -# The list of plugins that will be used to generate the report. -PLUGINS = ['anchors', - 'sitemap', - 'urllist', - 'images', - 'external', - 'notchkd', - 'badlinks', - 'old', - 'new', - 'size', - 'notitles', - 'problems', - 'about'] - -# Whether to overwrite files without asking. This is the state of the -f -# command line option. -OVERWRITE_FILES = False - -# Whether to add extra headers to outgoing requests, requesting to -# disable caching, ensuring that a fresh page is returned -BYPASSHTTPCACHE = False - -# The number of levels the sitemap plugin should show. 
-REPORT_SITEMAP_LEVEL = 8 - -# The age of pages in days that after which a page is considered too old. -REPORT_WHATSOLD_URL_AGE = 700 - -# The age of pages in days within wich a page is considered new. -REPORT_WHATSNEW_URL_AGE = 7 - -# The size of a page in kilobytes after which the page is considered too big. -REPORT_SLOW_URL_SIZE = 76 - -# The maximum number of links to show in the "referenced from:" lists -PARENT_LISTLEN = 10 - -# Whether to open links in a new window (add target="_blank") -# (disabled by default because it is not xhtml 1.1) -REPORT_LINKS_IN_NEW_WINDOW = False - -# A list of names that will be checked when encountering an file:/// -# directory. This file will be picked up instead of the directory list. -FILE_INDEXES = ['index.html', 'index.htm'] - -# A list of names that will be checked when encountering an ftp:// -# directory. This file will be picked up instead of the directory list. -FTP_INDEXES = ['index.html', 'index.htm'] - -# Whether to fetch robots.txt files and do checking based on the information -# present in those files (normally matching links are yanked). -USE_ROBOTS = True - -# This is a hash that maps netlocs (e.g. some.server.com:8000) to -# username/password combinations that are passed as basic authentication -# to that netloc -USERPASS = {} - -# Options for tidy (make None to disable running tidy) -# See http://tidy.sourceforge.net/docs/quickref.html for details. -TIDY_OPTIONS = dict(quiet=1, - accessibility_check=1, - show_errors=6, - show_warnings=1, - char_encoding='raw') diff --git a/crawler.py b/crawler.py deleted file mode 100644 index 5c842db..0000000 --- a/crawler.py +++ /dev/null @@ -1,421 +0,0 @@ - -# crawler.py - definition of Link class for storing the crawled site -# -# Copyright (C) 1998, 1999 Albert Hopkins (marduk) -# Copyright (C) 2002 Mike W. Meyer -# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""General module to do site-checking. This module contains the Site class -containing the state for the crawled site and some functions to access and -manipulate the crawling of the website. 
This module also contains the Link -class that holds all the link related properties.""" - -import atexit -import cookielib -import datetime -import httplib -import os -import re -import robotparser -import socket -import time -import urllib -import urllib2 -import urlparse - -import config -import db -import debugio -import parsers - - -class RedirectError(urllib2.HTTPError): - def __init__(self, url, code, msg, hdrs, fp, newurl): - self.newurl = newurl - urllib2.HTTPError.__init__(self, url, code, msg, hdrs, fp) - - -class NoRedirectHandler(urllib2.HTTPRedirectHandler): - - def redirect_request(self, req, fp, code, msg, headers, newurl): - raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl) - - -def setup_urllib2(): - """Configure the urllib2 module to store cookies in the output - directory.""" - filename = os.path.join(config.OUTPUT_DIR, 'cookies.lwp') - # set up our cookie jar - cookiejar = cookielib.LWPCookieJar(filename) - try: - cookiejar.load(ignore_discard=False, ignore_expires=False) - except IOError: - pass - atexit.register(cookiejar.save, ignore_discard=False, ignore_expires=False) - # set up our custom opener that sets a meaningful user agent - opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookiejar), - NoRedirectHandler()) - opener.addheaders = [ - ('User-agent', 'webcheck %s' % config.VERSION), - ] - if config.BYPASSHTTPCACHE: - opener.addheaders.append(('Cache-control', 'no-cache')) - opener.addheaders.append(('Pragma', 'no-cache')) - urllib2.install_opener(opener) - - -# pattern for matching spaces -_spacepattern = re.compile(' ') - -# pattern to match anchor part of a url -_anchorpattern = re.compile('#([^#]+)$') - - -# TODO: rename Site to Crawler -class Site(object): - """Class to represent gathered data of a site. - - The available properties of this class are: - - bases - a list of base link object - """ - - def __init__(self): - """Creates an instance of the Site class and initializes the - state of the site.""" - # list of internal urls - self._internal_urls = set() - # list of regexps considered internal - self._internal_res = {} - # list of regexps considered external - self._external_res = {} - # list of regexps matching links that should not be checked - self._yanked_res = {} - # map of scheme+netloc to robot handleds - self._robotparsers = {} - # list of base urls (these are the internal urls to start from) - self.bases = [] - - def add_internal(self, url): - """Add the given url and consider all urls below it to be internal. - These links are all marked for checking with the crawl() function.""" - url = db.Link.clean_url(url) - if url not in self._internal_urls: - self._internal_urls.add(url) - - def add_internal_re(self, exp): - """Adds the gived regular expression as a pattern to match internal - urls.""" - self._internal_res[exp] = re.compile(exp, re.IGNORECASE) - - def add_external_re(self, exp): - """Adds the gived regular expression as a pattern to match external - urls.""" - self._external_res[exp] = re.compile(exp, re.IGNORECASE) - - def add_yanked_re(self, exp): - """Adds the gived regular expression as a pattern to match urls that - will not be checked at all.""" - self._yanked_res[exp] = re.compile(exp, re.IGNORECASE) - - def _is_internal(self, url): - """Check whether the specified url is external or internal. 
- This uses the urls marked with add_internal() and the regular - expressions passed with add_external_re().""" - # check if it is internal through the regexps - for regexp in self._internal_res.values(): - if regexp.search(url) is not None: - return True - res = False - # check that the url starts with an internal url - if config.BASE_URLS_ONLY: - # the url must start with one of the _internal_urls - for i in self._internal_urls: - res |= (i == url[:len(i)]) - else: - # the netloc must match a netloc of an _internal_url - netloc = urlparse.urlsplit(url)[1] - for i in self._internal_urls: - res |= (urlparse.urlsplit(i)[1] == netloc) - # if it is not internal now, it never will be - if not res: - return False - # check if it is external through the regexps - for x in self._external_res.values(): - # if the url matches it is external and we can stop - if x.search(url): - return False - return True - - def _get_robotparser(self, scheme, netloc): - """Return the proper robots parser for the given url or None if one - cannot be constructed. Robot parsers are cached per scheme and - netloc.""" - # only some schemes have a meaningful robots.txt file - if scheme != 'http' and scheme != 'https': - debugio.debug('crawler._get_robotparser() ' - 'called with unsupported scheme (%s)' % scheme) - return None - # split out the key part of the url - location = urlparse.urlunsplit((scheme, netloc, '', '', '')) - # try to create a new robotparser if we don't already have one - if location not in self._robotparsers: - debugio.info(' getting robots.txt for %s' % location) - self._robotparsers[location] = None - try: - rp = robotparser.RobotFileParser() - rp.set_url(urlparse.urlunsplit( - (scheme, netloc, '/robots.txt', '', ''))) - rp.read() - self._robotparsers[location] = rp - except (TypeError, IOError, httplib.HTTPException): - # ignore any problems setting up robot parser - pass - return self._robotparsers[location] - - def _is_yanked(self, url): - """Check whether the specified url should not be checked at all. - This uses the regualr expressions passed with add_yanked_re() and the - robots information present.""" - # check if it is yanked through the regexps - for regexp in self._yanked_res.values(): - # if the url matches it is yanked and we can stop - if regexp.search(url): - return 'yanked' - # check if we should avoid external links - is_internal = self._is_internal(url) - if not is_internal and config.AVOID_EXTERNAL_LINKS: - return 'external avoided' - # check if we should use robot parsers - if not config.USE_ROBOTS: - return None - (scheme, netloc) = urlparse.urlsplit(url)[0:2] - # skip schemes not having robot.txt files - if scheme not in ('http', 'https'): - return None - # skip robot checks for external urls - # TODO: make this configurable - if not is_internal: - return None - # check robots for remaining links - rp = self._get_robotparser(scheme, netloc) - if rp and not rp.can_fetch('webcheck', url): - return 'robot restriced' - # fall back to allowing the url - return None - - def get_link(self, session, url): - # try to find the URL - url = db.Link.clean_url(url) - link = session.query(db.Link).filter_by(url=url).first() - if not link: - link = db.Link(url=url) - session.add(link) - return link - - def get_links_to_crawl(self, session): - links = session.query(db.Link).filter(db.Link.fetched == None) - return links.filter(db.Link.yanked == None) - - def crawl(self): - """Crawl the website based on the urls specified with - add_internal(). 
If the serialization file pointer - is specified the crawler writes out updated links to - the file while crawling the site.""" - # get a database session - session = db.Session() - # remove all links - if not config.CONTINUE: - session.query(db.LinkProblem).delete() - session.commit() - session.query(db.PageProblem).delete() - session.commit() - session.execute(db.children.delete()) - session.commit() - session.execute(db.embedded.delete()) - session.commit() - session.query(db.Link).delete() - session.commit() - # add all internal urls to the database - for url in self._internal_urls: - url = db.Link.clean_url(url) - self.get_link(session, url) - # add some URLs from the database that haven't been fetched - tocheck = self.get_links_to_crawl(session) - remaining = tocheck.count() - tocheck = tocheck[:100] - remaining -= len(tocheck) - # repeat until we have nothing more to check - while tocheck: - # choose a link from the tocheck list - link = tocheck.pop() - link.is_internal = self._is_internal(link.url) - link.yanked = self._is_yanked(link.url) - # see if there are any more links to check - if not tocheck: - tocheck = self.get_links_to_crawl(session) - remaining = tocheck.count() - tocheck = tocheck[:100] - remaining -= len(tocheck) - # skip link it there is nothing to check - if link.yanked or link.fetched: - continue - # fetch the link's contents - response = self.fetch(link) - if response: - self.parse(link, response) - # flush database changes - session.commit() - # sleep between requests if configured - if config.WAIT_BETWEEN_REQUESTS > 0: - debugio.debug('crawler.crawl(): sleeping %s seconds' % - config.WAIT_BETWEEN_REQUESTS) - time.sleep(config.WAIT_BETWEEN_REQUESTS) - debugio.debug('crawler.crawl(): items left to check: %d' % - (remaining + len(tocheck))) - session.commit() - - def fetch(self, link): - """Attempt to fetch the url (if not yanked) and fill in link - attributes (based on is_internal).""" - debugio.info(' %s' % link.url) - # mark the link as fetched to avoid loops - link.fetched = datetime.datetime.now() - # see if we can import the proper module for this scheme - try: - # FIXME: if an URI has a username:passwd add the uri, username and password to the HTTPPasswordMgr - request = urllib2.Request(link.url) - parent = link.parents.first() - if parent: - request.add_header('Referer', parent.url) - response = urllib2.urlopen(request) - link.mimetype = response.info().gettype() - link.set_encoding(response.headers.getparam('charset')) - # FIXME: get result code and other stuff - link.status = str(response.code) - # link.size = int(response.getheader('Content-length')) - # link.mtime = time.mktime(response.msg.getdate('Last-Modified')) - # if response.status == 301: link.add_linkproblem(str(response.status)+': '+response.reason) - # elif response.status != 200: link.add_linkproblem(str(response.status)+': '+response.reason) - # TODO: add checking for size - return response - except RedirectError, e: - link.status = str(e.code) - debugio.info(' ' + str(e)) - if e.code == 301: - link.add_linkproblem(str(e)) - link.add_redirect(e.newurl) - return - except urllib2.HTTPError, e: - link.status = str(e.code) - debugio.info(' ' + str(e)) - link.add_linkproblem(str(e)) - return - except urllib2.URLError, e: - debugio.info(' ' + str(e)) - link.add_linkproblem(str(e)) - return - except KeyboardInterrupt: - # handle this in a higher-level exception handler - raise - except Exception, e: - # handle all other exceptions - debugio.warn('unknown exception caught: ' + str(e)) - 
link.add_linkproblem('error reading HTTP response: %s' % str(e)) - import traceback - traceback.print_exc() - return - - def parse(self, link, response): - """Parse the fetched response.""" - # find a parser for the content-type - parsermodule = parsers.get_parsermodule(link.mimetype) - if parsermodule is None: - debugio.debug('crawler.Link.fetch(): unsupported content-type: %s' % link.mimetype) - return - # skip parsing of content if we were returned nothing - content = response.read() - if content is None: - return - # parse the content - debugio.debug('crawler.Link.fetch(): parsing using %s' % parsermodule.__name__) - try: - parsermodule.parse(content, link) - except Exception, e: - import traceback - traceback.print_exc() - debugio.warn('problem parsing page: ' + str(e)) - link.add_pageproblem('problem parsing page: ' + str(e)) - - def postprocess(self): - """Do some basic post processing of the collected data, including - depth calculation of every link.""" - # get a database session - session = db.Session() - # build the list of urls that were set up with add_internal() that - # do not have a parent (they form the base for the site) - for url in self._internal_urls: - link = self.get_link(session, url).follow_link() - if not link: - debugio.warn('base link %s redirects to nowhere' % url) - continue - # add the link to bases - debugio.debug('crawler.postprocess(): adding %s to bases' % link.url) - self.bases.append(link) - # if we got no bases, just use the first internal one - if not self.bases: - link = session.query(db.Link).filter(db.Link.is_internal == True).first() - debugio.debug('crawler.postprocess(): fallback to adding %s to bases' % link.url) - self.bases.append(link) - # do a breadth first traversal of the website to determine depth - session.query(db.Link).update(dict(depth=None), synchronize_session=False) - session.commit() - depth = 0 - count = len(self.bases) - for link in self.bases: - link.depth = 0 - session.commit() - debugio.debug('crawler.postprocess(): %d links at depth 0' % count) - while count > 0: - # update the depth of all links without a depth that have a - # parent with the previous depth - qry = session.query(db.Link).filter(db.Link.depth == None) - qry = qry.filter(db.Link.linked_from.any(db.Link.depth == depth)) - count = qry.update(dict(depth=depth + 1), synchronize_session=False) - session.commit() - depth += 1 - debugio.debug('crawler.postprocess(): %d links at depth %d' % (count, depth)) - # TODO: also handle embeds - # see if any of the plugins want to do postprocessing - for p in config.PLUGINS: - # import the plugin - plugin = __import__('plugins.' + p, globals(), locals(), [p]) - if hasattr(plugin, 'postprocess'): - debugio.info(' ' + p) - plugin.postprocess(self) - - def generate(self): - """Generate pages for plugins.""" - for p in config.PLUGINS: - # import the plugin - plugin = __import__('plugins.' + p, globals(), locals(), [p]) - if hasattr(plugin, 'generate'): - debugio.info(' ' + p) - plugin.generate(self) diff --git a/db.py b/db.py deleted file mode 100644 index 3426dfb..0000000 --- a/db.py +++ /dev/null @@ -1,300 +0,0 @@ - -# db.py - database access layer for webcheck -# -# Copyright (C) 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -import urlparse - -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy import distinct, func -from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey -from sqlalchemy.orm import relationship, backref, sessionmaker -from sqlalchemy.orm.session import object_session -from sqlalchemy.sql.expression import ClauseElement, union - -import config -import debugio -import myurllib - - -# provide session and schema classes -Session = sessionmaker() -Base = declarative_base() - - -children = Table( - 'children', Base.metadata, - Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True), - Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - ) - - -embedded = Table( - 'embedded', Base.metadata, - Column('parent_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True), - Column('child_id', Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - ) - - -class Link(Base): - - __tablename__ = 'links' - - id = Column(Integer, primary_key=True) - url = Column(String, index=True, nullable=False, unique=True) - is_internal = Column(Boolean, index=True) - yanked = Column(String, index=True) - fetched = Column(DateTime, index=True) - - # information about the retrieved link - status = Column(String) - mimetype = Column(String) - mimetype = Column(String) - encoding = Column(String) - size = Column(Integer) - mtime = Column(DateTime, index=True) - is_page = Column(Boolean, index=True) - title = Column(String, index=True) - author = Column(String) - - # relationships between links - children = relationship('Link', secondary=children, - backref=backref('linked_from', lazy='dynamic'), - primaryjoin=(id == children.c.parent_id), - secondaryjoin=(id == children.c.child_id), - lazy='dynamic') - embedded = relationship('Link', secondary=embedded, - backref=backref('embedded_in', lazy='dynamic'), - primaryjoin=(id == embedded.c.parent_id), - secondaryjoin=(id == embedded.c.child_id), - lazy='dynamic') - - # crawling information - redirectdepth = Column(Integer, default=0) - depth = Column(Integer) - - @staticmethod - def clean_url(url): - # normalise the URL, removing the fragment from the URL - url = myurllib.normalizeurl(url) - return urlparse.urldefrag(myurllib.normalizeurl(url))[0] - - def _get_link(self, url): - """Get a link object for the specified URL.""" - # get the session - session = object_session(self) - # normalise the URL, removing the fragment from the URL - url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url)) - # try to find the link - instance = session.query(Link).filter_by(url=url).first() - if not instance: - instance = Link(url=url) - session.add(instance) - # mark that we were looking for an anchor/fragment - if fragment: - instance.add_reqanchor(self, fragment) - # return the link - return instance - - def set_encoding(self, encoding): - """Set the 
encoding of the link doing some basic checks to see if - the encoding is supported.""" - if not self.encoding and encoding: - try: - debugio.debug('crawler.Link.set_encoding(%r)' % encoding) - unicode('just some random text', encoding, 'replace') - self.encoding = encoding - except Exception, e: - import traceback - traceback.print_exc() - self.add_pageproblem('unknown encoding: %s' % encoding) - - def add_redirect(self, url): - """Indicate that this link redirects to the specified url.""" - url = self.clean_url(url) - # figure out depth - self.redirectdepth = max([self.redirectdepth] + - [x.redirectdepth for x in self.parents]) + 1 - # check depth - if self.redirectdepth >= config.REDIRECT_DEPTH: - self.add_linkproblem('too many redirects (%d)' % self.redirectdepth) - return - # check for redirect to self - if url == self.url: - self.add_linkproblem('redirect same as source: %s' % url) - return - # add child - self.add_child(url) - - def add_linkproblem(self, message): - """Indicate that something went wrong while retrieving this link.""" - self.linkproblems.append(LinkProblem(message=message)) - - def add_pageproblem(self, message): - """Indicate that something went wrong with parsing the document.""" - # only think about problems on internal pages - if not self.is_internal: - return - # TODO: only include a single problem once (e.g. multiple anchors) - self.pageproblems.append(PageProblem(message=message)) - - def add_child(self, url): - """Add the specified URL as a child of this link.""" - # ignore children for external links - if not self.is_internal: - return - # add to children - self.children.append(self._get_link(url)) - - def add_embed(self, url): - """Mark the given URL as used as an image on this page.""" - # ignore embeds for external links - if not self.is_internal: - return - # add to embedded - self.embedded.append(self._get_link(url)) - - def add_anchor(self, anchor): - """Indicate that this page contains the specified anchor.""" - # lowercase anchor - anchor = anchor.lower() - if self.anchors.filter(Anchor.anchor == anchor).first(): - self.add_pageproblem( - 'anchor/id "%(anchor)s" defined multiple times' - % {'anchor': anchor}) - else: - self.anchors.append(Anchor(anchor=anchor)) - - def add_reqanchor(self, parent, anchor): - """Indicate that the specified link contains a reference to the - specified anchor. This can be checked later.""" - # lowercase anchor - anchor = anchor.lower() - # if RequestedAnchor doesn't exist, add it - if not self.reqanchors.filter((RequestedAnchor.parent_id == parent.id) & (RequestedAnchor.anchor == anchor)).first(): - self.reqanchors.append(RequestedAnchor(parent_id=parent.id, anchor=anchor)) - - def follow_link(self, visited=None): - """If this link represents a redirect return the redirect target, - otherwise return self. 
If this redirect does not find a referenced - link None is returned.""" - # if this is not a redirect just return - if not self.redirectdepth: - return self - # if we don't know where this redirects, return None - if not self.children.count(): - return None - # avoid loops - if not visited: - visited = set() - visited.add(self.url) - # the first (and only) child is the redirect target - child = self.children.first() - if child.url in visited: - return None - # check where we redirect to - return child.follow_link(visited) - - @property - def count_parents(self): - session = object_session(self) - p1 = session.query(func.count(distinct(children.c.parent_id))).filter(children.c.child_id == self.id) - p2 = session.query(func.count(distinct(embedded.c.parent_id))).filter(embedded.c.child_id == self.id) - return p1.scalar() + p2.scalar() - - @property - def parents(self): - session = object_session(self) - #links = object_session(self).query(Link) - #links = links.join(children, Link.id == children.c.parent_id) - #links = links.join(embedded, Link.id == embedded.c.parent_id) - #return links.filter((children.c.child_id == self.id) | - # (embedded.c.child_id == self.id)).distinct() - parent_ids = union(session.query(children.c.parent_id).filter(children.c.child_id == self.id), - session.query(embedded.c.parent_id).filter(embedded.c.child_id == self.id)) - - return session.query(Link).filter(Link.id == parent_ids.c.children_parent_id).distinct() - - -class LinkProblem(Base): - """Storage of problems in the URL itself (e.g. problem downloading the - associated resource).""" - - __tablename__ = 'linkproblems' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - message = Column(String, index=True) - link = relationship(Link, backref=backref('linkproblems', order_by=message, - cascade='all,delete,delete-orphan')) - - def __unicode__(self): - return self.message - - -class PageProblem(Base): - """Storage of problems in the information from the retrieved URL (e.g. 
- invalid HTML).""" - - __tablename__ = 'pageproblems' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - message = Column(String, index=True) - link = relationship(Link, backref=backref('pageproblems', order_by=message, - cascade='all,delete,delete-orphan')) - - def __unicode__(self): - return self.message - - -class Anchor(Base): - """The named anchors (IDs) found on the page.""" - - __tablename__ = 'anchors' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - link = relationship(Link, backref=backref('anchors', - lazy='dynamic', - cascade='all,delete,delete-orphan')) - anchor = Column(String) - - def __unicode__(self): - return self.anchor - - -class RequestedAnchor(Base): - """The named anchors (IDs) found on the page.""" - - __tablename__ = 'reqanchors' - - id = Column(Integer, primary_key=True) - link_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - link = relationship(Link, backref=backref('reqanchors', - lazy='dynamic', - cascade='all,delete,delete-orphan', - ), primaryjoin='Link.id == RequestedAnchor.link_id') - parent_id = Column(Integer, ForeignKey('links.id', ondelete='CASCADE'), index=True) - parent = relationship(Link, primaryjoin='Link.id == RequestedAnchor.parent_id') - anchor = Column(String) - - def __unicode__(self): - return self.anchor diff --git a/debugio.py b/debugio.py deleted file mode 100644 index 6d7f698..0000000 --- a/debugio.py +++ /dev/null @@ -1,65 +0,0 @@ - -# debugio.py - output logging module -# -# Copyright (C) 1998, 1999 Albert Hopkins (marduk) -# Copyright (C) 2002 Mike W. Meyer -# Copyright (C) 2005, 2006, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""Debugging and message output module. - -This module contains facilities for logging program output. 
The use of -this module is really simple: import it, set loglevel, and use debug(), -info(), warn() and error() whenever you want to print something.""" - -import sys - -# log levels that can be used -ERROR = 0 -WARN = 1 -INFO = 2 -DEBUG = 3 - -# initialize logging at default level -loglevel = INFO - - -def debug(msg): - """Log the message to stderr if loglevel will allow it.""" - if loglevel >= DEBUG: - sys.stderr.write('webcheck: DEBUG: ' + str(msg) + '\n') - - -def info(msg): - """Log the message to stdout if loglevel will allow it.""" - if loglevel >= INFO: - sys.stdout.write('webcheck: ' + str(msg) + '\n') - sys.stdout.flush() - - -def warn(msg): - """Log a warning to stderr if loglevel will allow it.""" - if loglevel >= WARN: - sys.stderr.write('webcheck: Warning: ' + str(msg) + '\n') - - -def error(msg): - """Log an error to stderr if loglevel will allow it.""" - if loglevel >= ERROR: - sys.stderr.write('webcheck: Error: ' + str(msg) + '\n') diff --git a/monkeypatch.py b/monkeypatch.py deleted file mode 100644 index cf9218e..0000000 --- a/monkeypatch.py +++ /dev/null @@ -1,81 +0,0 @@ - -# monkeypatch.py - add missing functionality to standard modules -# -# Copyright (C) 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -import re -import urlparse -import urllib -import sys - - -__all__ = [] - - -# This monkeypatches RuleLine.applies_to to support * and $ characters in -# robots.txt path names. 
-def my_applies_to(ruleline, filename): - if not hasattr(ruleline, 'pattern'): - pat = [] - # we need to unescape the * from the path here - for x in ruleline.path.replace('%2A', '*'): - if x == '*': - pat.append('.*') - elif x == '$': - pat.append(r'\Z') - else: - pat.append(re.escape(x)) - ruleline.pattern = re.compile(''.join(pat) + '(?ms)') - return bool(ruleline.pattern.match(filename)) - -from robotparser import RuleLine -RuleLine.applies_to = my_applies_to - - -# This monkeypatches RobotFileParser.can_fetch to include the query string -# into the tested part of the URL, taken from http://bugs.python.org/issue6325 -# this should be fixed in Python 2.7 -if sys.version_info < (2, 7): - - def my_can_fetch(rfp, useragent, url): - """using the parsed robots.txt decide if useragent can fetch url""" - if rfp.disallow_all: - return False - if rfp.allow_all: - return True - # search for given user agent matches - # the first match counts - parsed_url = urlparse.urlparse(urllib.unquote(url)) - url = urlparse.urlunparse(('', '', parsed_url.path, - parsed_url.params, parsed_url.query, parsed_url.fragment)) - url = urllib.quote(url) - if not url: - url = "/" - for entry in rfp.entries: - if entry.applies_to(useragent): - return entry.allowance(url) - # try the default entry last - if rfp.default_entry: - return rfp.default_entry.allowance(url) - # agent not found ==> access granted - return True - - from robotparser import RobotFileParser - RobotFileParser.can_fetch = my_can_fetch diff --git a/myurllib.py b/myurllib.py deleted file mode 100644 index bd5987c..0000000 --- a/myurllib.py +++ /dev/null @@ -1,120 +0,0 @@ - -# myurllib.py - general purpose URL handling library -# -# Copyright (C) 2007, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -import urlparse -import re -import urllib - -# this is a workaround for Python 2.3 -try: - set -except NameError: - from sets import Set as set - -# The way I read RFC3986 (especially sections 3.3 and 6.2) is that these -# are all separate and valid URLs that point to the same resource. -# -# In section 6.2.2.3 only the removal of "." and ".." in paths is -# mentioned although 6.2.3 does leave some room for other normalisation. 
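
The RFC 3986 reading sketched in the comment above is what normalizeurl() further down in this file implements. As an illustration only (the URL below is hypothetical and not part of the patch), and assuming _urlclean() behaves as written, the helper should fold case, an explicit default port and harmless %-escapes into a single canonical spelling, in the module's Python 2 idiom:

    import myurllib  # after this commit: from webcheck import myurllib

    # Upper-case scheme and host, an explicit default port and a %7E escape
    # should all collapse to the same canonical form (hypothetical URL).
    print myurllib.normalizeurl('HTTP://Example.COM:80/b/c%7Ed')
    # expected: http://example.com/b/c~d
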
- -# pattern for matching URL-encoded characters -_urlencpattern = re.compile('(%[0-9a-fA-F]{2})') - -# characters that should be unescaped in URLs -_okurlchars = set('-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ' \ - '_abcdefghijklmnopqrstuvwxyz~') - -# pattern for matching characters that should be escaped -_urlprobpattern = re.compile('([^-;/?:@&=+$,%#.0123456789' \ - 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_' \ - 'abcdefghijklmnopqrstuvwxyz~])') - -# pattern for double slashes -_doubleslashpattern = re.compile('//+') - -# pattern for leading dots -_leadingdotpattern = re.compile('^(/\.\.)*') - - -def _unescape_printable(match): - """Helper function for _normalize_escapes() to perform the expansion of - html entity refs that are normal printable (but not reserver) - characters.""" - # unescape the character - r = chr(int(match.group(1)[1:3], 16)) - if r in _okurlchars: - return r - # transform remaining escapes to uppercase - return match.group(1).upper() - - -def _normalize_escapes(url): - """Ensure that escaping in the url is consistent. Any reserved characters - are left alone. Any characters that are printable but are escaped are - unescaped. Any non-printable characters are escaped.""" - # url decode any printable normal characters (this leaves us with a string - # with as much stuff unquoted as # possible) - url = _urlencpattern.sub(_unescape_printable, url) - # url encode any nonprintable or problematic characters (but not reserved - # characters) so we're left with a string with everything that needs to be - # quoted as such - url = _urlprobpattern.sub(lambda x: '%%%02X' % ord(x.group(1)), url) - return url - - -def _urlclean(url): - """Clean the url of uneccesary parts.""" - # make escaping consistent - url = _normalize_escapes(url) - # split the url in useful parts - (scheme, netloc, path, query, fragment) = urlparse.urlsplit(url) - # remove any leading /../ parts - if scheme in ('http', 'https'): - path = _leadingdotpattern.sub('', path) - if scheme in ('http', 'https', 'ftp'): - # http(s) urls should have a non-empty path - if path == '': - path = '/' - # make hostname lower case - (userpass, hostport) = urllib.splituser(netloc) - (host, port) = urllib.splitport(hostport) - # remove default port - if scheme == 'http' and str(port) == '80': - hostport = host - elif scheme == 'https' and str(port) == '443': - hostport = host - netloc = hostport.lower() - # trim trailing : - if netloc[-1:] == ':': - netloc = netloc[:-1] - if userpass is not None: - netloc = userpass + '@' + netloc - # get rid of double slashes in some paths - if scheme == 'file': - path = _doubleslashpattern.sub('/', path) - # put the url back together again - return urlparse.urlunsplit((scheme, netloc, path, query, fragment)) - - -def normalizeurl(url): - """Return a normalized URL.""" - return _urlclean(url) diff --git a/parsers/__init__.py b/parsers/__init__.py deleted file mode 100644 index 3bfbd1f..0000000 --- a/parsers/__init__.py +++ /dev/null @@ -1,63 +0,0 @@ - -# __init__.py - general content-type parser interface -# -# Copyright (C) 2005, 2006, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""This package groups all the supported content-types. - -A content-type module can be requested by the get_parsemodule() function. -Each module should export the following function: - - parse(content, link) - Based on the content, fill in the common fields of the link object.""" - -# the modules that should be imported -_modules = ('html', 'css') - -# a map of mimetypes to modules -_parsermodules = {} - - -def _init_modules(): - """Initialize the modules.""" - # go throught all known modules to probe the content-types - # (do this only once) - for mod in _modules: - parser = __import__('parsers.' + mod, globals(), locals(), [mod]) - for mimetype in parser.mimetypes: - _parsermodules[mimetype] = parser - - -def get_parsermodule(mimetype): - """Look up the correct module for the specified mimetype.""" - if _parsermodules == {}: - _init_modules() - # check if we have a supported content-type - if mimetype in _parsermodules: - return _parsermodules[mimetype] - return None - - -def get_mimetypes(): - """Return a list of supported mime types that can be parsed - by the installed parsers.""" - if _parsermodules == {}: - _init_modules() - return _parsermodules.keys() diff --git a/parsers/css.py b/parsers/css.py deleted file mode 100644 index 5ab2905..0000000 --- a/parsers/css.py +++ /dev/null @@ -1,55 +0,0 @@ - -# css.py - parser functions for css content -# -# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA -# -# The files produced as output from the software do not automatically fall -# under the copyright of the software, unless explicitly stated otherwise. - -"""This modules attempts to parse CSS files. -It currently looks for url() links in stylesheet contents and also -looks for @import processing directives.""" - -mimetypes = ('text/css',) - -import urlparse -import re - -# pattern for matching /* ... */ comments in css -_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL) - -# pattern for matching @import "url" statments in css -_importpattern = re.compile('@import\s+["\']([^"\']*)["\']', - re.IGNORECASE | re.DOTALL) - -# pattern for matching url(...) 
diff --git a/parsers/css.py b/parsers/css.py
deleted file mode 100644
index 5ab2905..0000000
--- a/parsers/css.py
+++ /dev/null
@@ -1,55 +0,0 @@
-
-# css.py - parser functions for css content
-#
-# Copyright (C) 2005, 2006, 2009, 2011 Arthur de Jong
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-# The files produced as output from the software do not automatically fall
-# under the copyright of the software, unless explicitly stated otherwise.
-
-"""This module attempts to parse CSS files.
-It currently looks for url() links in stylesheet contents and also
-looks for @import processing directives."""
-
-mimetypes = ('text/css',)
-
-import urlparse
-import re
-
-# pattern for matching /* ... */ comments in css
-_commentpattern = re.compile('/\*.*?\*/', re.IGNORECASE | re.DOTALL)
-
-# pattern for matching @import "url" statements in css
-_importpattern = re.compile('@import\s+["\']([^"\']*)["\']',
-                            re.IGNORECASE | re.DOTALL)
-
-# pattern for matching url(...) in css
-_urlpattern = re.compile('url\(["\']?(.*?)["\']?\)')
-
-
-def parse(content, link, base=None):
-    """Parse the specified content and extract information for crawling the
-    site further."""
-    # if no base is specified, get it from the link
-    base = base or link.url
-    # strip out comments from the content
-    content = _commentpattern.sub('', content)
-    # handle @imports
-    for embed in _importpattern.findall(content):
-        link.add_embed(urlparse.urljoin(base, embed))
-    # handle url()s
-    for embed in _urlpattern.findall(content):
-        link.add_embed(urlparse.urljoin(base, embed))
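
The css parser only needs an object with a url attribute and an add_embed() method, which makes it easy to exercise on its own. A sketch using a stand-in for the real crawler.Link object (FakeLink is hypothetical; only the two members used by parse() are provided), assuming the module is importable as webcheck.parsers.css after this move:

    # sketch only: feeding a stylesheet to the css parser by hand
    from webcheck.parsers import css


    class FakeLink(object):
        """Stand-in for crawler.Link that just collects embedded URLs."""

        def __init__(self, url):
            self.url = url
            self.embeds = []

        def add_embed(self, url):
            self.embeds.append(url)


    link = FakeLink('http://example.com/css/site.css')
    css.parse('/* @import "commented-out.css" */\n'
              '@import "print.css";\n'
              'body { background: url(../img/bg.png); }', link)
    print link.embeds
    # expected: ['http://example.com/css/print.css',
    #            'http://example.com/img/bg.png']
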
diff --git a/parsers/html/__init__.py b/parsers/html/__init__.py
deleted file mode 100644
index 09966f4..0000000
--- a/parsers/html/__init__.py
+++ /dev/null
@@ -1,123 +0,0 @@
-
-# html.py - parser functions for html content
-#
-# Copyright (C) 2005, 2006, 2007, 2008, 2011 Arthur de Jong
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-# The files produced as output from the software do not automatically fall
-# under the copyright of the software, unless explicitly stated otherwise.
-
-"""Parser functions for processing HTML content. This is a front-end
-module that tries to load the BeautifulSoup parser first and falls
-back to loading the legacy HTMLParser parser."""
-
-import debugio
-import re
-import htmlentitydefs
-import config
-
-# the list of mimetypes this module should be able to handle
-mimetypes = ('text/html', 'application/xhtml+xml', 'text/x-server-parsed-html')
-
-# pattern for matching all html entities
-_entitypattern = re.compile('&(#[0-9]{1,6}|[a-zA-Z]{2,10});')
-
-
-def htmlescape(txt):
-    """HTML escape the given string and return an ASCII clean string with
-    known entities and character entities for the other values."""
-    # check for empty string
-    if not txt:
-        return u''
-    # convert to unicode object
-    if not isinstance(txt, unicode):
-        txt = unicode(txt)
-    # the output string
-    out = ''
-    # loop over the characters of the string
-    for c in txt:
-        if ord(c) in htmlentitydefs.codepoint2name:
-            out += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
-        elif ord(c) > 126:
-            out += '&#%d;' % ord(c)
-        else:
-            out += c.encode('utf-8')
-    return out
-
-
-def _unescape_entity(match):
-    """Helper function for htmlunescape().
-    This function unescapes an html entity, it is passed to the sub()
-    function."""
-    if match.group(1) in htmlentitydefs.name2codepoint:
-        # we have a named entity, return proper character
-        return unichr(htmlentitydefs.name2codepoint[match.group(1)])
-    elif match.group(1)[0] == '#':
-        # we have a numeric entity, replace with proper character
-        return unichr(int(match.group(1)[1:]))
-    else:
-        # we have something else, just keep the original
-        return match.group(0)
-
-
-def htmlunescape(txt):
-    """This function unescapes an html encoded string.
-    This function returns a unicode string."""
-    # check for empty string
-    if not txt:
-        return u''
-    # convert to unicode
-    if not isinstance(txt, unicode):
-        txt = unicode(txt, errors='replace')
-    # replace &name; and &#nn; refs with proper characters
-    txt = _entitypattern.sub(_unescape_entity, txt)
-    # we're done
-    return txt
-
-
-def _parsefunction(content, link):
-    # find a suitable parse function and replace this stub with it
-    global _parsefunction
-    try:
-        # try BeautifulSoup parser first
-        import parsers.html.beautifulsoup
-        debugio.debug('parsers.html.parse(): the BeautifulSoup parser is ok')
-        _parsefunction = parsers.html.beautifulsoup.parse
-    except ImportError:
-        # fall back to legacy HTMLParser parser
-        debugio.warn('falling back to the legacy HTML parser, '
-                     'consider installing BeautifulSoup')
-        import parsers.html.htmlparser
-        _parsefunction = parsers.html.htmlparser.parse
-    # call the actual parse function
-    _parsefunction(content, link)
-
-
-def parse(content, link):
-    """Parse the specified content and extract a url list, a list of images,
-    a title and an author. The content is assumed to contain HTML."""
-    # call the normal parse function
-    _parsefunction(content, link)
-    # call the tidy parse function
-    if config.TIDY_OPTIONS:
-        try:
-            import calltidy
-            debugio.debug('parsers.html.parse(): the Tidy parser is ok')
-            calltidy.parse(content, link)
-        except ImportError:
-            debugio.warn('tidy library (python-utidylib) is unavailable')
-            # remove config to only try once
-            config.TIDY_OPTIONS = None
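
htmlescape() and htmlunescape() are rough inverses: the first produces an ASCII-only byte string using named or numeric entities, the second turns entity references back into a unicode string. A quick sketch, assuming the relocated module imports cleanly as webcheck.parsers.html:

    # sketch only: round-tripping a string through the helpers above
    from webcheck.parsers import html

    print html.htmlescape(u'caf\xe9 & more')
    # expected: 'caf&eacute; &amp; more'

    print html.htmlunescape('caf&eacute; &amp; more&#8230;')
    # expected: u'caf\xe9 & more\u2026'
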
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
deleted file mode 100644
index 268014d..0000000
--- a/parsers/html/beautifulsoup.py
+++ /dev/null
@@ -1,191 +0,0 @@
-
-# beautifulsoup.py - parser functions for html content
-#
-# Copyright (C) 2007, 2008, 2009, 2011 Arthur de Jong
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-#
-# The files produced as output from the software do not automatically fall
-# under the copyright of the software, unless explicitly stated otherwise.
-
-"""Parser functions for processing HTML content. This module uses the
-BeautifulSoup HTML parser and is more flexible than the legacy HTMLParser
-module."""
-
-import urlparse
-import crawler
-import re
-import htmlentitydefs
-import BeautifulSoup
-import myurllib
-from parsers.html import htmlunescape
-
-# pattern for matching http-equiv and content part of
-# <meta http-equiv="refresh" content="0;url=URL">
-_refreshhttpequivpattern = re.compile('^refresh$', re.I)
-_refershcontentpattern = re.compile('^[0-9]+;url=(.*)$', re.I)
-
-# check BeautifulSoup find() function for bugs
-if BeautifulSoup.BeautifulSoup('<foo>').find('foo', bar=True):
-    import debugio
-    debugio.warn('using buggy version of BeautifulSoup (%s)' %
-                 BeautifulSoup.__version__)
-
-
-def parse(content, link):
-    """Parse the specified content and extract a url list, a list of images,
-    a title and an author. The content is assumed to contain HTML."""
-    # create parser and feed it the content
-    soup = BeautifulSoup.BeautifulSoup(content,
-                                       fromEncoding=str(link.encoding))
-    # fetch document encoding
-    link.set_encoding(soup.originalEncoding)
-    # <title>TITLE</title>
-    title = soup.find('title')
-    if title and title.string:
-        link.title = htmlunescape(title.string).strip()
-
-    # FIXME: using myurllib.normalizeurl is wrong below, we should probably use
-    #        something like link.urlunescape() to do the escaping and check
-    #        and log at the same time
-
-    # <base href="url">
-    base = soup.find('base', href=True)
-    if base:
-        base = myurllib.normalizeurl(htmlunescape(base['href']).strip())
-    else:
-        base = link.url
-    # <link rel="type" href="url">
-    for l in soup.findAll('link', rel=True, href=True):
-        if l['rel'].lower() in ('stylesheet', 'alternate stylesheet', 'icon',
-                                'shortcut icon'):
-            embed = myurllib.normalizeurl(htmlunescape(l['href']).strip())
-            if embed:
-                link.add_embed(urlparse.urljoin(base, embed))
-    # <meta name="author" content="AUTHOR">
-    author = soup.find('meta', attrs={'name': re.compile("^author$", re.I),
-                                      'content': True})
-    if author and author['content']:
-        link.author = htmlunescape(author['content']).strip()
-    # <meta http-equiv="refresh" content="0;url=URL">
-    refresh = soup.find('meta', attrs={'http-equiv': _refreshhttpequivpattern,
-                                       'content': True})
-    if refresh and refresh['content']:
-        try:
-            child = _refershcontentpattern.search(refresh['content']).group(1)
-        except AttributeError:
-            pass  # ignore cases where refresh header parsing causes problems
-        else:
-            link.add_child(urlparse.urljoin(base, child))
-    # <img src="url">
-    for img in soup.findAll('img', src=True):
-        embed = myurllib.normalizeurl(htmlunescape(img['src']).strip())
-        if embed:
-            link.add_embed(urlparse.urljoin(base, embed))
-    # <a href="url">
-    for a in soup.findAll('a', href=True):
-        child = myurllib.normalizeurl(htmlunescape(a['href']).strip())
-        if child:
-            link.add_child(urlparse.urljoin(base, child))
-    # <a name="anchor">
-    # TODO: consistent url escaping?
-    for a in soup.findAll('a', attrs={'name': True}):
-        # get anchor name
-        a_name = myurllib.normalizeurl(htmlunescape(a['name']).strip())
-        # if both id and name are used they should be the same
-        if 'id' in a and \
-           a_name != myurllib.normalizeurl(htmlunescape(a['id']).strip()):
-            link.add_pageproblem(
-                'anchors defined in name and id attributes do not match')
-            # add the id anchor anyway
-            link.add_anchor(myurllib.normalizeurl(htmlunescape(a['id']).strip()))
-        # add the anchor
-        link.add_anchor(a_name)
-    # <any id="anchor">
-    for elem in soup.findAll(id=True):
-        # skip anchors that have a name
-        if elem.name == 'a' and 'name' in elem:
-            continue
-        # add the anchor
-        link.add_anchor(myurllib.normalizeurl(htmlunescape(elem['id']).strip()))
-    # <frame src="url">...
-    for frame in soup.findAll('frame', src=True):
-        embed = myurllib.normalizeurl(htmlunescape(frame['src']).strip())
-        if embed:
-            link.add_embed(urlparse.urljoin(base, embed))
-    #