Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org> 2007-05-12 00:01:43 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2007-05-12 00:01:43 +0200
commit: ab871230c5bdf62c88008d9e8b757cfedce6f8dc (patch)
tree: dd08371ba7dea290c6210cea425f644e174849c8
parent: a96663a5d2fe9df0af364fe6119b791e9b67fb17 (diff)
switch robots.txt handling to default on again (broken in 1.9.8) and add new --ignore-robots option to be able to ignore robots retrieval
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@330 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- config.py 4
-rw-r--r-- webcheck.1 9
-rwxr-xr-x webcheck.py 4
3 files changed, 15 insertions, 2 deletions
diff --git a/config.py b/config.py
index 6b6430d..1c379e7 100644
--- a/config.py
+++ b/config.py
@@ -3,7 +3,7 @@
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike Meyer
-# Copyright (C) 2005, 2006 Arthur de Jong
+# Copyright (C) 2005, 2006, 2007 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -113,4 +113,4 @@ FTP_INDEXES = [ 'index.html', 'index.htm' ]
# Whether to fetch robots.txt files and do checking based on the information
# present in those files (normally matching links are yanked).
-USE_ROBOTS = False
+USE_ROBOTS = True
diff --git a/webcheck.1 b/webcheck.1
index b166b77..bfcadf0 100644
--- a/webcheck.1
+++ b/webcheck.1
@@ -80,6 +80,15 @@ check to see if that external document exists.
This flag disables that action.
.TP
+.B \-\-ignore\-robots
+Do not retrieve and parse robots.txt files.
+By default robots.txt files are retrieved and honored.
+If you are sure you want to ignore and override the webmaster's
+decision this option can be used.
+.br
+For more information on robots.txt handling see the NOTES section below.
+
+.TP
.B \-q, \-\-quiet, \-\-silent
Do not print out progress as webcheck traverses a site.
diff --git a/webcheck.py b/webcheck.py
index 12e84ab..6849961 100755
--- a/webcheck.py
+++ b/webcheck.py
@@ -70,6 +70,7 @@ def print_help():
' -b, --base-only base URLs only: consider any URL not starting\n'
' with any of the base URLs to be external\n'
' -a, --avoid-external do not check external URLs\n'
+ ' --ignore-robots do not retrieve and parse robots.txt files\n'
' -q, --quiet, --silent suppress progress messages\n'
' -d, --debug do programmer-level debugging\n'
' -o, --output=DIRECTORY store the generated reports in the specified\n'
@@ -90,6 +91,7 @@ def parse_args(site):
optlist, args = getopt.gnu_getopt(sys.argv[1:],
'i:x:y:l:baqdo:cfr:w:Vh',
('internal=', 'external=', 'yank=', 'base-only', 'avoid-external',
+ 'ignore-robots',
'quiet', 'silent', 'debug', 'output=', 'continue',
'force', 'redirects=', 'wait=', 'version', 'help'))
for flag, arg in optlist:
@@ -103,6 +105,8 @@ def parse_args(site):
config.BASE_URLS_ONLY = True
elif flag in ('-a', '--avoid-external'):
config.AVOID_EXTERNAL_LINKS = True
+ elif flag in ('--ignore-robots'):
+ config.USE_ROBOTS = False
elif flag in ('-q', '--quiet', '--silent'):
debugio.loglevel = debugio.ERROR
elif flag in ('-d', '--debug'):