From ab871230c5bdf62c88008d9e8b757cfedce6f8dc Mon Sep 17 00:00:00 2001
From: Arthur de Jong
Date: Fri, 11 May 2007 22:01:43 +0000
Subject: switch robots.txt handling to default on again (broken in 1.9.8) and
 add new --ignore-robots option to be able to ignore robots retrieval

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@330 86f53f14-5ff3-0310-afe5-9b438ce3f40c
---
 config.py   | 4 ++--
 webcheck.1  | 9 +++++++++
 webcheck.py | 4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/config.py b/config.py
index 6b6430d..1c379e7 100644
--- a/config.py
+++ b/config.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -113,4 +113,4 @@ FTP_INDEXES = [ 'index.html', 'index.htm' ]
 
 # Whether to fetch robots.txt files and do checking based on the information
 # present in those files (normally matching links are yanked).
-USE_ROBOTS = False
+USE_ROBOTS = True
diff --git a/webcheck.1 b/webcheck.1
index b166b77..bfcadf0 100644
--- a/webcheck.1
+++ b/webcheck.1
@@ -79,6 +79,15 @@ and it finds a link that points to an external document,
 it will check to see if that external document exists.
 This flag disables that action.
 
+.TP
+.B \-\-ignore\-robots
+Do not retrieve and parse robots.txt files.
+By default robots.txt files are retrieved and honored.
+If you are sure you want to ignore and override the webmaster's
+decision this option can be used.
+.br
+For more information on robots.txt handling see the NOTES section below.
+
 .TP
 .B \-q, \-\-quiet, \-\-silent
 Do not print out progress as webcheck traverses a site.
diff --git a/webcheck.py b/webcheck.py
index 12e84ab..6849961 100755
--- a/webcheck.py
+++ b/webcheck.py
@@ -70,6 +70,7 @@ def print_help():
       '  -b, --base-only        base URLs only: consider any URL not starting\n'
       '                         with any of the base URLs to be external\n'
       '  -a, --avoid-external   do not check external URLs\n'
+      '  --ignore-robots        do not retrieve and parse robots.txt files\n'
      '  -q, --quiet, --silent  suppress progress messages\n'
      '  -d, --debug            do programmer-level debugging\n'
      '  -o, --output=DIRECTORY store the generated reports in the specified\n'
@@ -90,6 +91,7 @@ def parse_args(site):
     optlist, args = getopt.gnu_getopt(sys.argv[1:],
         'i:x:y:l:baqdo:cfr:w:Vh',
         ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
+         'ignore-robots',
          'quiet', 'silent', 'debug', 'output=', 'continue', 'force',
          'redirects=', 'wait=', 'version', 'help'))
     for flag, arg in optlist:
@@ -103,6 +105,8 @@ def parse_args(site):
             config.BASE_URLS_ONLY = True
         elif flag in ('-a', '--avoid-external'):
             config.AVOID_EXTERNAL_LINKS = True
+        elif flag in ('--ignore-robots'):
+            config.USE_ROBOTS = False
         elif flag in ('-q', '--quiet', '--silent'):
             debugio.loglevel = debugio.ERROR
         elif flag in ('-d', '--debug'):
--
cgit v1.2.3
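
For readers who want to see what the restored USE_ROBOTS default and the new --ignore-robots switch amount to in practice, below is a minimal sketch of how such a configuration flag can gate robots.txt retrieval. It is not webcheck's actual crawler code: the helper is_fetch_allowed(), the _robot_parsers cache and the use of Python 3's standard urllib.robotparser module are assumptions made for illustration only. The one grounded detail is the control flow that the patch establishes: --ignore-robots sets config.USE_ROBOTS to False, and with that flag off no robots.txt file is requested at all.

# Illustrative sketch only -- NOT webcheck's actual code.  It shows how a
# USE_ROBOTS-style switch can gate robots.txt handling with Python 3's
# standard urllib.robotparser module; is_fetch_allowed() and _robot_parsers
# are names invented for this example.

import urllib.parse
import urllib.robotparser

USE_ROBOTS = True        # same default that this patch restores in config.py
_robot_parsers = {}      # cached RobotFileParser objects, keyed by scheme://host


def is_fetch_allowed(url, user_agent='webcheck'):
    """Return True if robots.txt rules permit fetching url.

    With USE_ROBOTS set to False (the --ignore-robots case) no robots.txt
    file is ever retrieved and every URL is allowed."""
    if not USE_ROBOTS:
        return True
    parts = urllib.parse.urlsplit(url)
    site = '%s://%s' % (parts.scheme, parts.netloc)
    parser = _robot_parsers.get(site)
    if parser is None:
        parser = urllib.robotparser.RobotFileParser(site + '/robots.txt')
        try:
            parser.read()            # fetch and parse the site's robots.txt
        except OSError:
            parser.allow_all = True  # unreachable robots.txt: assume no rules
        _robot_parsers[site] = parser
    return parser.can_fetch(user_agent, url)

In this sketch, running with --ignore-robots corresponds to the guard at the top of is_fetch_allowed() short-circuiting to True, which matches the behaviour described for the option in the webcheck.1 hunk above.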