author     Arthur de Jong <arthur@arthurdejong.org>    2007-05-12 00:01:43 +0200
committer  Arthur de Jong <arthur@arthurdejong.org>    2007-05-12 00:01:43 +0200
commit     ab871230c5bdf62c88008d9e8b757cfedce6f8dc (patch)
tree       dd08371ba7dea290c6210cea425f644e174849c8
parent     a96663a5d2fe9df0af364fe6119b791e9b67fb17 (diff)
switch robots.txt handling back to on by default (the default was broken in 1.9.8) and add a new --ignore-robots option so that robots.txt retrieval can be skipped
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@330 86f53f14-5ff3-0310-afe5-9b438ce3f40c
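For context on what this toggle controls: robots.txt handling means that, before checking a site, the crawler fetches /robots.txt from each host and skips ("yanks") links that the file disallows. With USE_ROBOTS restored to True this is again the default, and the new --ignore-robots option switches it off. The sketch below is only an illustration of that general mechanism using Python 3's standard urllib.robotparser module; the function name and example URL are made up and none of this is webcheck's own code, whose robots.txt logic lives elsewhere and is merely gated by config.USE_ROBOTS.

```python
# Illustrative sketch only (Python 3, not webcheck code): how a link checker
# can honour robots.txt, and how a flag like config.USE_ROBOTS can disable it.
import urllib.robotparser
from urllib.parse import urlsplit, urlunsplit

USE_ROBOTS = True  # mirrors the config.USE_ROBOTS default restored in this commit


def is_allowed(url, user_agent='webcheck', use_robots=USE_ROBOTS):
    """Return True if url may be fetched according to the site's robots.txt.

    With use_robots=False (the behaviour selected by --ignore-robots) no
    robots.txt is retrieved and every URL is treated as allowed.
    """
    if not use_robots:
        return True
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()  # fetch and parse the robots.txt file
    except OSError:
        return True  # an unreachable robots.txt does not block checking
    return rp.can_fetch(user_agent, url)


if __name__ == '__main__':
    # hypothetical example URL
    print(is_allowed('http://www.example.com/private/page.html'))
```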
-rw-r--r--  config.py   | 4
-rw-r--r--  webcheck.1  | 9
-rwxr-xr-x  webcheck.py | 4
3 files changed, 15 insertions, 2 deletions
diff --git a/config.py b/config.py
--- a/config.py
+++ b/config.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -113,4 +113,4 @@ FTP_INDEXES = [ 'index.html', 'index.htm' ]
 
 # Whether to fetch robots.txt files and do checking based on the information
 # present in those files (normally matching links are yanked).
-USE_ROBOTS = False
+USE_ROBOTS = True
diff --git a/webcheck.1 b/webcheck.1
--- a/webcheck.1
+++ b/webcheck.1
@@ -80,6 +80,15 @@ check to see if that external document exists.
 This flag disables that action.
 .TP
+.B \-\-ignore\-robots
+Do not retrieve and parse robots.txt files.
+By default robots.txt files are retrieved and honored.
+If you are sure you want to ignore and override the webmaster's
+decision this option can be used.
+.br
+For more information on robots.txt handling see the NOTES section below.
+
+.TP
 .B \-q, \-\-quiet, \-\-silent
 Do not print out progress as webcheck traverses a site.
diff --git a/webcheck.py b/webcheck.py
index 12e84ab..6849961 100755
--- a/webcheck.py
+++ b/webcheck.py
@@ -70,6 +70,7 @@ def print_help():
       ' -b, --base-only        base URLs only: consider any URL not starting\n'
       '                        with any of the base URLs to be external\n'
       ' -a, --avoid-external   do not check external URLs\n'
+      ' --ignore-robots        do not retrieve and parse robots.txt files\n'
       ' -q, --quiet, --silent  suppress progress messages\n'
       ' -d, --debug            do programmer-level debugging\n'
       ' -o, --output=DIRECTORY store the generated reports in the specified\n'
@@ -90,6 +91,7 @@ def parse_args(site):
     optlist, args = getopt.gnu_getopt(sys.argv[1:],
       'i:x:y:l:baqdo:cfr:w:Vh',
       ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
+       'ignore-robots',
        'quiet', 'silent', 'debug', 'output=', 'continue', 'force',
        'redirects=', 'wait=', 'version', 'help'))
     for flag, arg in optlist:
@@ -103,6 +105,8 @@ def parse_args(site):
             config.BASE_URLS_ONLY = True
         elif flag in ('-a', '--avoid-external'):
             config.AVOID_EXTERNAL_LINKS = True
+        elif flag in ('--ignore-robots'):
+            config.USE_ROBOTS = False
         elif flag in ('-q', '--quiet', '--silent'):
             debugio.loglevel = debugio.ERROR
         elif flag in ('-d', '--debug'):
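One small Python detail in the parse_args() hunk above: ('--ignore-robots') is a parenthesised string, not a one-element tuple, so the in test performs substring matching rather than tuple membership. The snippet below only demonstrates that language behaviour; it is not part of the commit.

```python
# Demonstration of the membership test used in the parse_args() change above.
flag = '--ignore-robots'

# Parentheses around a single string do not make a tuple, so `in` does a
# substring search against the string itself.
print(flag in ('--ignore-robots'))    # True: the string contains itself
print('-r' in ('--ignore-robots'))    # also True: '-r' occurs inside '--ignore-robots'

# A one-element tuple (note the trailing comma) gives exact membership instead.
print(flag in ('--ignore-robots',))   # True
print('-r' in ('--ignore-robots',))   # False
```

For the flags getopt actually returns this only matters if a later elif branch handles an option such as -r that happens to be a substring of '--ignore-robots'; whether that applies depends on code outside the hunks shown, and writing ('--ignore-robots',) with a trailing comma would make the test unambiguous. With the option in place, robots.txt retrieval can be skipped by passing --ignore-robots on the command line; without it the restored USE_ROBOTS = True default applies.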