From ab871230c5bdf62c88008d9e8b757cfedce6f8dc Mon Sep 17 00:00:00 2001
From: Arthur de Jong
Date: Fri, 11 May 2007 22:01:43 +0000
Subject: switch robots.txt handling to default on again (broken in 1.9.8) and
 add new --ignore-robots option to be able to ignore robots retrieval

git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@330 86f53f14-5ff3-0310-afe5-9b438ce3f40c
---
 config.py   | 4 ++--
 webcheck.1  | 9 +++++++++
 webcheck.py | 4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/config.py b/config.py
index 6b6430d..1c379e7 100644
--- a/config.py
+++ b/config.py
@@ -3,7 +3,7 @@
 #
 # Copyright (C) 1998, 1999 Albert Hopkins (marduk)
 # Copyright (C) 2002 Mike Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007 Arthur de Jong
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -113,4 +113,4 @@ FTP_INDEXES = [ 'index.html', 'index.htm' ]
 
 # Whether to fetch robots.txt files and do checking based on the information
 # present in those files (normally matching links are yanked).
-USE_ROBOTS = False
+USE_ROBOTS = True
diff --git a/webcheck.1 b/webcheck.1
index b166b77..bfcadf0 100644
--- a/webcheck.1
+++ b/webcheck.1
@@ -79,6 +79,15 @@ and it finds a link that points to an external document,
 it will check to see if that external document exists.
 This flag disables that action.
 
+.TP
+.B \-\-ignore\-robots
+Do not retrieve and parse robots.txt files.
+By default robots.txt files are retrieved and honored.
+If you are sure you want to ignore and override the webmaster's
+decision this option can be used.
+.br
+For more information on robots.txt handling see the NOTES section below.
+
 .TP
 .B \-q, \-\-quiet, \-\-silent
 Do not print out progress as webcheck traverses a site.
diff --git a/webcheck.py b/webcheck.py
index 12e84ab..6849961 100755
--- a/webcheck.py
+++ b/webcheck.py
@@ -70,6 +70,7 @@ def print_help():
       '  -b, --base-only        base URLs only: consider any URL not starting\n'
       '                         with any of the base URLs to be external\n'
       '  -a, --avoid-external   do not check external URLs\n'
+      '  --ignore-robots        do not retrieve and parse robots.txt files\n'
      '  -q, --quiet, --silent  suppress progress messages\n'
      '  -d, --debug            do programmer-level debugging\n'
      '  -o, --output=DIRECTORY store the generated reports in the specified\n'
@@ -90,6 +91,7 @@ def parse_args(site):
     optlist, args = getopt.gnu_getopt(sys.argv[1:],
         'i:x:y:l:baqdo:cfr:w:Vh',
         ('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
+         'ignore-robots',
          'quiet', 'silent', 'debug', 'output=', 'continue', 'force',
          'redirects=', 'wait=', 'version', 'help'))
     for flag, arg in optlist:
@@ -103,6 +105,8 @@ def parse_args(site):
             config.BASE_URLS_ONLY = True
         elif flag in ('-a', '--avoid-external'):
             config.AVOID_EXTERNAL_LINKS = True
+        elif flag in ('--ignore-robots'):
+            config.USE_ROBOTS = False
         elif flag in ('-q', '--quiet', '--silent'):
             debugio.loglevel = debugio.ERROR
         elif flag in ('-d', '--debug'):
--
cgit v1.2.3
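
For readers who want to see what the restored USE_ROBOTS default and the new --ignore-robots switch amount to in practice, below is a minimal sketch of how such a configuration flag can gate robots.txt retrieval. It is not webcheck's actual crawler code: the helper is_fetch_allowed(), the _robot_parsers cache and the use of Python 3's standard urllib.robotparser module are assumptions made for illustration only. The one grounded detail is the control flow that the patch establishes: --ignore-robots sets config.USE_ROBOTS to False, and with that flag off no robots.txt file is requested at all.

# Illustrative sketch only -- NOT webcheck's actual code.  It shows how a
# USE_ROBOTS-style switch can gate robots.txt handling with Python 3's
# standard urllib.robotparser module; is_fetch_allowed() and _robot_parsers
# are names invented for this example.

import urllib.parse
import urllib.robotparser

USE_ROBOTS = True        # same default that this patch restores in config.py
_robot_parsers = {}      # cached RobotFileParser objects, keyed by scheme://host


def is_fetch_allowed(url, user_agent='webcheck'):
    """Return True if robots.txt rules permit fetching url.

    With USE_ROBOTS set to False (the --ignore-robots case) no robots.txt
    file is ever retrieved and every URL is allowed."""
    if not USE_ROBOTS:
        return True
    parts = urllib.parse.urlsplit(url)
    site = '%s://%s' % (parts.scheme, parts.netloc)
    parser = _robot_parsers.get(site)
    if parser is None:
        parser = urllib.robotparser.RobotFileParser(site + '/robots.txt')
        try:
            parser.read()            # fetch and parse the site's robots.txt
        except OSError:
            parser.allow_all = True  # unreachable robots.txt: assume no rules
        _robot_parsers[site] = parser
    return parser.can_fetch(user_agent, url)

In this sketch, running with --ignore-robots corresponds to the guard at the top of is_fetch_allowed() short-circuiting to True, which matches the behaviour described for the option in the webcheck.1 hunk above.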