author     Arthur de Jong <arthur@arthurdejong.org>    2007-05-12 00:01:43 +0200
committer  Arthur de Jong <arthur@arthurdejong.org>    2007-05-12 00:01:43 +0200
commit     ab871230c5bdf62c88008d9e8b757cfedce6f8dc (patch)
tree       dd08371ba7dea290c6210cea425f644e174849c8
parent     a96663a5d2fe9df0af364fe6119b791e9b67fb17 (diff)
switch robots.txt handling back to on by default (the default was broken in 1.9.8) and add a new --ignore-robots option so that robots.txt retrieval can be skipped
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@330 86f53f14-5ff3-0310-afe5-9b438ce3f40c
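For context on what this toggle controls: robots.txt handling means that, before checking a site, the crawler fetches /robots.txt from each host and skips ("yanks") links that the file disallows. With USE_ROBOTS restored to True this is again the default, and the new --ignore-robots option switches it off. The sketch below is only an illustration of that general mechanism using Python 3's standard urllib.robotparser module; the function name and example URL are made up and none of this is webcheck's own code, whose robots.txt logic lives elsewhere and is merely gated by config.USE_ROBOTS.

```python
# Illustrative sketch only (Python 3, not webcheck code): how a link checker
# can honour robots.txt, and how a flag like config.USE_ROBOTS can disable it.
import urllib.robotparser
from urllib.parse import urlsplit, urlunsplit

USE_ROBOTS = True  # mirrors the config.USE_ROBOTS default restored in this commit


def is_allowed(url, user_agent='webcheck', use_robots=USE_ROBOTS):
    """Return True if url may be fetched according to the site's robots.txt.

    With use_robots=False (the behaviour selected by --ignore-robots) no
    robots.txt is retrieved and every URL is treated as allowed.
    """
    if not use_robots:
        return True
    parts = urlsplit(url)
    robots_url = urlunsplit((parts.scheme, parts.netloc, '/robots.txt', '', ''))
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.read()  # fetch and parse the robots.txt file
    except OSError:
        return True  # an unreachable robots.txt does not block checking
    return rp.can_fetch(user_agent, url)


if __name__ == '__main__':
    # hypothetical example URL
    print(is_allowed('http://www.example.com/private/page.html'))
```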
-rw-r--r--  config.py   | 4
-rw-r--r--  webcheck.1  | 9
-rwxr-xr-x  webcheck.py | 4
3 files changed, 15 insertions, 2 deletions
diff --git a/config.py b/config.py
--- a/config.py
+++ b/config.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -113,4 +113,4 @@ FTP_INDEXES = [ 'index.html', 'index.htm' ]
 
 # Whether to fetch robots.txt files and do checking based on the information
 # present in those files (normally matching links are yanked).
-USE_ROBOTS = False
+USE_ROBOTS = True
diff --git a/webcheck.1 b/webcheck.1
--- a/webcheck.1
+++ b/webcheck.1
@@ -80,6 +80,15 @@ check to see if that external document exists.
 This flag disables that action.
 .TP
+.B \-\-ignore\-robots
+Do not retrieve and parse robots.txt files.
+By default robots.txt files are retrieved and honored.
+If you are sure you want to ignore and override the webmaster's
+decision this option can be used.
+.br
+For more information on robots.txt handling see the NOTES section below.
+
+.TP
 .B \-q, \-\-quiet, \-\-silent
 Do not print out progress as webcheck traverses a site.
diff --git a/webcheck.py b/webcheck.py
index 12e84ab..6849961 100755
--- a/webcheck.py
+++ b/webcheck.py
@@ -70,6 +70,7 @@ def print_help():
       ' -b, --base-only        base URLs only: consider any URL not starting\n'
       '                        with any of the base URLs to be external\n'
       ' -a, --avoid-external   do not check external URLs\n'
+      ' --ignore-robots        do not retrieve and parse robots.txt files\n'
       ' -q, --quiet, --silent  suppress progress messages\n'
       ' -d, --debug            do programmer-level debugging\n'
       ' -o, --output=DIRECTORY store the generated reports in the specified\n'
@@ -90,6 +91,7 @@ def parse_args(site):
     optlist, args = getopt.gnu_getopt(sys.argv[1:],
       'i:x:y:l:baqdo:cfr:w:Vh',
       ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
+       'ignore-robots',
        'quiet', 'silent', 'debug', 'output=', 'continue', 'force',
        'redirects=', 'wait=', 'version', 'help'))
     for flag, arg in optlist:
@@ -103,6 +105,8 @@ def parse_args(site):
             config.BASE_URLS_ONLY = True
         elif flag in ('-a', '--avoid-external'):
             config.AVOID_EXTERNAL_LINKS = True
+        elif flag in ('--ignore-robots'):
+            config.USE_ROBOTS = False
         elif flag in ('-q', '--quiet', '--silent'):
             debugio.loglevel = debugio.ERROR
         elif flag in ('-d', '--debug'):
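One small Python detail in the parse_args() hunk above: ('--ignore-robots') is a parenthesised string, not a one-element tuple, so the in test performs substring matching rather than tuple membership. The snippet below only demonstrates that language behaviour; it is not part of the commit.

```python
# Demonstration of the membership test used in the parse_args() change above.
flag = '--ignore-robots'

# Parentheses around a single string do not make a tuple, so `in` does a
# substring search against the string itself.
print(flag in ('--ignore-robots'))    # True: the string contains itself
print('-r' in ('--ignore-robots'))    # also True: '-r' occurs inside '--ignore-robots'

# A one-element tuple (note the trailing comma) gives exact membership instead.
print(flag in ('--ignore-robots',))   # True
print('-r' in ('--ignore-robots',))   # False
```

For the flags getopt actually returns this only matters if a later elif branch handles an option such as -r that happens to be a substring of '--ignore-robots'; whether that applies depends on code outside the hunks shown, and writing ('--ignore-robots',) with a trailing comma would make the test unambiguous. With the option in place, robots.txt retrieval can be skipped by passing --ignore-robots on the command line; without it the restored USE_ROBOTS = True default applies.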