1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
|
"""
Robots.txt file parser class. Accepts a list of lines or robots.txt URL as
input, builds a set of rules from that list, then answers questions about
fetchability of other URLs.
Change made by marduk@python.net to support proxies.
RobotFileParser class can be instantiated with optional proxies parameter,
just like FancyURLopener in urllib.
"""
class RobotFileParser:
    """Parse a robots.txt file and answer whether a URL may be fetched.

    Typical use: call ``set_url()`` and ``read()`` (or hand the lines
    directly to ``parse()``), then query ``can_fetch()``.

    Attributes:
        proxies: value handed to the URL opener in ``read()``; ``None``
            leaves proxy selection to the opener.
        rules: dict mapping a user-agent name to a list of compiled
            patterns, one per disallowed path prefix.  An empty list means
            nothing is disallowed for that agent.
        debug: when true, progress messages are written to stdout.
        url: location of the robots.txt file fetched by ``read()``.
        last_checked: ``time.time()`` of the last ``parse()``; 0 if never.
    """

    def __init__(self, proxies = None):
        """Create an empty parser; *proxies* is used later by ``read()``."""
        self.proxies = proxies
        self.rules = {}
        self.debug = 0
        self.url = ''
        self.last_checked = 0

    def _trace(self, *parts):
        """Write *parts*, space-separated, to stdout when debugging is on."""
        if self.debug:
            import sys
            sys.stdout.write(' '.join([str(part) for part in parts]) + '\n')

    def mtime(self):
        """Return the time of the last ``parse()`` (0 if never parsed)."""
        return self.last_checked

    def modified(self):
        """Record the current time as the moment the rules were refreshed."""
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Remember *url* as the robots.txt location; stored as given."""
        self.url = url

    def read(self):
        """Fetch ``self.url`` and feed its lines to ``parse()``.

        Network errors raised by the opener propagate to the caller.
        """
        try:
            from urllib import FancyURLopener  # Python 2 location
        except ImportError:
            FancyURLopener = None
        if FancyURLopener is not None:
            stream = FancyURLopener(self.proxies).open(self.url)
        else:
            # Python 3: FancyURLopener is deprecated (and gone in recent
            # releases), so build an opener with the same proxy setup.
            from urllib.request import ProxyHandler, build_opener
            if self.proxies is None:
                opener = build_opener()
            else:
                opener = build_opener(ProxyHandler(self.proxies))
            stream = opener.open(self.url)
        # Always release the connection, even if reading fails.
        try:
            raw_lines = stream.readlines()
        finally:
            stream.close()
        lines = []
        for raw in raw_lines:
            # Python 3 responses yield bytes; parse() works on text.
            if not isinstance(raw, str):
                raw = raw.decode('latin-1')
            lines.append(raw)
        self.parse(lines)

    def parse(self, lines):
        """Build ``self.rules`` from an iterable of robots.txt lines.

        Lines may or may not carry a line terminator (``\\n`` or ``\\r\\n``).
        A blank line ends the current record; ``#`` starts a comment.  Each
        non-empty ``Disallow`` value is stored as a literal path prefix for
        every user-agent named in the current record; an empty ``Disallow``
        clears the rules of those agents (everything allowed).  Rules are
        added to whatever ``self.rules`` already holds.
        """
        import re
        active = []
        for line in lines:
            self._trace('>', line.rstrip('\r\n'))
            # blank line terminates current record
            if not line.strip():
                active = []
                continue
            # remove optional comment and strip line; only cut when a '#'
            # is really present so the last character is never lost
            hash_at = line.find('#')
            if hash_at >= 0:
                line = line[:hash_at]
            line = line.strip()
            if not line:
                # comment-only line: skipped without ending the record
                continue
            # split on the first colon only, so values may contain ':'
            fields = line.split(':', 1)
            if len(fields) != 2:
                continue
            field = fields[0].strip().lower()
            value = fields[1].strip()
            if field == 'user-agent':
                # this record applies to this user agent
                self._trace('>> user-agent:', value)
                active.append(value)
                if value not in self.rules:
                    self.rules[value] = []
            elif field == 'disallow':
                if value:
                    self._trace('>> disallow:', value)
                    # escape so the path is matched literally, not as a
                    # regular expression
                    pattern = re.compile(re.escape(value))
                    for agent in active:
                        self.rules[agent].append(pattern)
                else:
                    for agent in active:
                        self._trace('>> allow', agent)
                        self.rules[agent] = []
            else:
                self._trace('>> unknown:', [field, value])
        self.modified()

    def can_fetch(self, agent, url):
        """Return 1 if *agent* is allowed to fetch *url*, else 0.

        Rules for the exact *agent* name are used when present, otherwise
        the ``*`` rules; with neither, everything is allowed.  A URL is
        disallowed when its path starts with any disallowed prefix.
        """
        try:
            from urlparse import urlparse  # Python 2 location
        except ImportError:
            from urllib.parse import urlparse
        if agent in self.rules:
            key = agent
        elif '*' in self.rules:
            key = '*'
        else:
            self._trace('>> allowing', url, 'fetch by', agent)
            return 1
        # a URL with no path refers to the site root
        path = urlparse(url)[2] or '/'
        for rule in self.rules[key]:
            if rule.match(path):
                self._trace('>> disallowing', url, 'fetch by', agent)
                return 0
        self._trace('>> allowing', url, 'fetch by', agent)
        return 1
def test():
    """Fetch a live robots.txt with debugging on and print sample verdicts.

    Prints the parsed rules, then the ``can_fetch()`` result for each
    sample (agent, URL) pair.  Requires network access.
    """
    parser = RobotFileParser()
    parser.debug = 1
    parser.set_url('http://www.automatrix.com/robots.txt')
    parser.read()
    print(parser.rules)
    samples = (
        ('*', 'http://www.calendar.com/concerts/'),
        ('Musi-Cal-Robot',
         'http://dolphin:80/cgi-bin/music-search?performer=Rolling+Stones'),
        ('Lycos', 'http://www/~skip/volkswagen/'),
        ('Lycos', 'http://www/~skip/volkswagen/vanagon-list-001'),
    )
    for agent, url in samples:
        print(parser.can_fetch(agent, url))
|