#!/usr/bin/env python

# cmd.py - command-line front-end for webcheck
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
# Copyright (C) 2005, 2006, 2007, 2008, 2010, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

"""This is the main webcheck module."""

import argparse
import logging

import webcheck
import webcheck.monkeypatch  # imported for its side effects
from webcheck.crawler import Crawler, default_cfg


version_string = '''
webcheck %s
Written by Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.

Copyright (C) 1998-2013
Albert Hopkins (marduk), Mike W. Meyer and Arthur de Jong.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
'''.strip() % webcheck.__version__


class VersionAction(argparse.Action):
    """Custom argparse action that prints version information to stdout
    and exits (argparse's built-in action='version' writes to stderr on
    Python 2.7)."""

    def __init__(self, option_strings, dest,
                 help='output version information and exit'):
        super(VersionAction, self).__init__(
            option_strings=option_strings,
            dest=argparse.SUPPRESS,
            default=argparse.SUPPRESS,
            nargs=0,
            help=help)

    def __call__(self, parser, namespace, values, option_string=None):
        print(version_string)
        parser.exit()


# set up command line parser
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description='Generate a report for the given URLs.')
parser.add_argument(
    '-V', '--version', action=VersionAction)
parser.add_argument(
    '-i', '--internal', metavar='PATTERN', action='append',
    help='mark URLs matching PATTERN as internal')
parser.add_argument(
    '-x', '--external', metavar='PATTERN', action='append',
    help='mark URLs matching PATTERN as external')
parser.add_argument(
    '-y', '--yank', metavar='PATTERN', action='append',
    help='do not check URLs matching PATTERN')
parser.add_argument(
    '-b', '--base-only', action='store_true',
    help='base URLs only: consider any URL not starting with any of the base URLs to be external')
parser.add_argument(
    '-a', '--avoid-external', action='store_true',
    help='do not check external URLs')
parser.add_argument(
    '--ignore-robots', action='store_true',
    help='do not retrieve or parse robots.txt files')
parser.add_argument(
    '-q', '--quiet', '--silent', action='store_true',
    help='suppress progress messages')
parser.add_argument(
    '-d', '--debug', action='store_true',
    help='show programmer-level debug information')
parser.add_argument(
    '-o', '--output', dest='output_dir', metavar='DIRECTORY',
    help='store the generated reports in the specified directory')
parser.add_argument(
    '-c', '--continue', action='store_true',
    help='try to continue from a previous run')
parser.add_argument(
    '-f', '--force', action='store_true',
    help='overwrite files without asking')
parser.add_argument(
    '-r', '--redirects', metavar='N', type=int,
    help='the number of redirects webcheck should follow; 0 means follow all redirects')
parser.add_argument(
    '-l', '--max-depth', '--levels', metavar='N', type=int,
    help='maximum depth of links to follow from base URLs')
parser.add_argument(
    '-w', '--wait', metavar='SECONDS', type=float,
    help='wait SECONDS between retrievals')
parser.add_argument(
    '--profile', action='store_true', help=argparse.SUPPRESS)
parser.add_argument(
    'base_urls', metavar='URL', nargs='+')
parser.set_defaults(**default_cfg)
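
# Example invocation (hypothetical URL; assumes the console script is
# installed under the name `webcheck` via the setuptools entry point below):
#
#   webcheck --output /tmp/report --wait 0.5 http://www.example.com/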


def main(cfg):
    """Main program."""
    # configure logging
    if cfg.get('quiet', False):
        level = logging.WARNING
    elif cfg.get('debug', False):
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(format='webcheck: %(levelname)s: %(message)s', level=level)
    # set up crawler and go
    crawler = Crawler(cfg)
    logging.info('checking site...')
    crawler.crawl()
    logging.info('done.')
    logging.info('postprocessing...')
    crawler.postprocess()
    logging.info('done.')
    logging.info('generating reports...')
    crawler.generate()
    logging.info('done.')
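
# main() can also be driven programmatically with a plain configuration
# dictionary; a minimal sketch, assuming the keys mirror the command-line
# options and the imported default_cfg:
#
#   main(dict(default_cfg, base_urls=['http://www.example.com/']))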


def entry_point():
    """setuptools entry point"""
    args = parser.parse_args()
    main(vars(args))
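

# Allow running this file directly (matching the shebang above); a small
# addition, since installed copies normally use the console-script entry point:
if __name__ == '__main__':
    entry_point()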