1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
|
# crawler.py - definition of Link class for storing the crawled site
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
# Copyright (C) 2005, 2006, 2007, 2008, 2011, 2013 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
"""General module to do site-checking. This module contains the Crawler class
containing the state for the crawled site and some functions to access and
manipulate the crawling of the website. This module also contains the Link
class that holds all the link related properties."""
import atexit
import cookielib
import datetime
import httplib
import logging
import os
import re
import robotparser
import time
import urllib
import urllib2
import urlparse
from webcheck import config
from webcheck.db import Session, Link, setup_db, truncate_db
from webcheck.output import install_file
import webcheck.parsers
logger = logging.getLogger(__name__)
class RedirectError(urllib2.HTTPError):
    """HTTPError subclass raised for HTTP redirect responses.

    The target URL of the redirect is exposed via the `newurl`
    attribute so the crawler can record it on the link."""

    def __init__(self, url, code, msg, hdrs, fp, newurl):
        urllib2.HTTPError.__init__(self, url, code, msg, hdrs, fp)
        self.newurl = newurl
class NoRedirectHandler(urllib2.HTTPRedirectHandler):
    # Disables urllib2's transparent redirect following: instead of
    # silently fetching the new location, raise a RedirectError so the
    # crawler can record the redirect as a property of the link itself.
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        raise RedirectError(req.get_full_url(), code, msg, headers, fp, newurl)
def _setup_urllib2():
    """Configure the urllib2 module to store cookies in the output
    directory and to send a meaningful set of default headers."""
    import webcheck  # local import to avoid import loop
    cookie_file = os.path.join(config.OUTPUT_DIR, 'cookies.txt')
    # set up our cookie jar, loading any cookies saved by a previous run
    jar = cookielib.MozillaCookieJar(cookie_file)
    try:
        jar.load(ignore_discard=False, ignore_expires=False)
    except IOError:
        # no cookie file yet; one will be written on exit
        pass
    atexit.register(jar.save, ignore_discard=False, ignore_expires=False)
    # build an opener that tracks cookies, refuses to transparently
    # follow redirects and identifies itself as webcheck
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar),
                                  NoRedirectHandler())
    headers = [('User-agent', 'webcheck %s' % webcheck.__version__)]
    if config.BYPASSHTTPCACHE:
        headers.append(('Cache-control', 'no-cache'))
        headers.append(('Pragma', 'no-cache'))
    opener.addheaders = headers
    urllib2.install_opener(opener)
# pattern for matching spaces
_spacepattern = re.compile(' ')
# pattern to match anchor part of a url
_anchorpattern = re.compile('#([^#]+)$')
# default crawler configuration, snapshotting the config module's
# current values ('continue' is a Python keyword, hence the dict literal
# rather than dict(...) keyword arguments)
default_cfg = {
    'internal': [],
    'external': [],
    'yank': [],
    'base_only': config.BASE_URLS_ONLY,
    'avoid_external': config.AVOID_EXTERNAL_LINKS,
    'ignore_robots': not config.USE_ROBOTS,
    'output': config.OUTPUT_DIR,
    'force': config.OVERWRITE_FILES,
    'redirects': config.REDIRECT_DEPTH,
    'max_depth': config.MAX_DEPTH,
    'wait': config.WAIT_BETWEEN_REQUESTS,
    'continue': config.CONTINUE,
}
class Config(object):
    """Simple attribute-based configuration container.

    Positional arguments must be mappings and keyword arguments are
    individual settings; all of them become instance attributes, with
    later values overriding earlier ones."""

    def __init__(self, *args, **kwargs):
        self.update(*args, **kwargs)

    def update(self, *args, **kwargs):
        """Merge the given mappings and keyword settings into the
        configuration."""
        state = vars(self)
        for mapping in args:
            state.update(mapping)
        # keyword arguments win over the mappings
        state.update(kwargs)
class Crawler(object):
    """Class to represent gathered data of a site.

    The available properties of this class are:

      site_name - the name of the website that is crawled
      base_urls - a list of base URLs
      plugins   - a list of plugin modules used by the crawler
    """

    def __init__(self, cfg):
        """Creates an instance of the Crawler class and initializes the
        state of the site.

        cfg is a mapping (or Config) of crawler options; missing options
        fall back to the module-level default_cfg values."""
        # complete the configuration with the defaults
        self.cfg = Config(default_cfg)
        self.cfg.update(cfg)
        # compile regexps for URLs considered internal
        self._internal_res = {}
        for pattern in self.cfg.internal:
            self._internal_res[pattern] = re.compile(pattern, re.IGNORECASE)
        # compile regexps for URLs considered external
        self._external_res = {}
        for pattern in self.cfg.external:
            self._external_res[pattern] = re.compile(pattern, re.IGNORECASE)
        # compile regexps matching links that should not be checked
        self._yanked_res = {}
        for pattern in self.cfg.yank:
            self._yanked_res[pattern] = re.compile(pattern, re.IGNORECASE)
        # push the crawler configuration into the global configuration
        config.BASE_URLS_ONLY = self.cfg.base_only
        config.AVOID_EXTERNAL_LINKS = self.cfg.avoid_external
        config.USE_ROBOTS = not self.cfg.ignore_robots
        # BUG FIX: default_cfg defines this option as 'output' but this
        # previously read self.cfg.output_dir, raising AttributeError for
        # callers relying on the defaults; accept either spelling
        config.OUTPUT_DIR = getattr(self.cfg, 'output_dir', self.cfg.output)
        # 'continue' is a Python keyword so plain attribute access fails
        config.CONTINUE = getattr(self.cfg, 'continue')
        config.OVERWRITE_FILES = self.cfg.force
        config.REDIRECT_DEPTH = self.cfg.redirects
        config.MAX_DEPTH = self.cfg.max_depth
        config.WAIT_BETWEEN_REQUESTS = self.cfg.wait
        # map of scheme+netloc to robot parsers
        self._robotparsers = {}
        # site name is filled in during postprocess()
        self.site_name = None
        # load the plugin modules listed in the configuration
        self.plugins = [
            __import__(plugin, globals(), locals(), [plugin])
            for plugin in config.PLUGINS]
        # add base urls, cleaned up and de-duplicated
        self.base_urls = []
        for url in self.cfg.base_urls:
            # if it does not look like a url it is probably a local file
            if urlparse.urlsplit(url)[0] == '':
                url = 'file://' + urllib.pathname2url(os.path.abspath(url))
            # clean the URL and add it
            url = Link.clean_url(url)
            if url not in self.base_urls:
                self.base_urls.append(url)

    def setup_database(self):
        """Open (creating it and the output directory if needed) the
        SQLite database used to store crawl state. Safe to call more
        than once; subsequent calls are no-ops."""
        if hasattr(self, 'database_configed'):
            return
        self.database_configed = True
        if not os.path.isdir(config.OUTPUT_DIR):
            os.mkdir(config.OUTPUT_DIR)
        filename = os.path.join(config.OUTPUT_DIR, 'webcheck.sqlite')
        setup_db(filename)

    def _is_internal(self, url):
        """Check whether the specified url is external or internal. This
        uses the urls marked with add_base() and the regular expressions
        passed with add_external_re()."""
        # urls matching an internal regexp are always internal
        for regexp in self._internal_res.values():
            if regexp.search(url) is not None:
                return True
        if config.BASE_URLS_ONLY:
            # the url must start with one of the base URLs
            if not any(url.startswith(x) for x in self.base_urls):
                return False
        else:
            # the netloc must match a netloc of one of the base URLs
            netloc = urlparse.urlsplit(url)[1]
            if not any((urlparse.urlsplit(x)[1] == netloc) for x in self.base_urls):
                return False
        # urls matching an external regexp are external after all
        for x in self._external_res.values():
            if x.search(url):
                return False
        return True

    def _get_robotparser(self, scheme, netloc):
        """Return the proper robots parser for the given url or None if one
        cannot be constructed. Robot parsers are cached per scheme and
        netloc."""
        # only some schemes have a meaningful robots.txt file
        if scheme != 'http' and scheme != 'https':
            logger.debug('called with unsupported scheme (%s)', scheme)
            return None
        # split out the key part of the url
        location = urlparse.urlunsplit((scheme, netloc, '', '', ''))
        # try to create a new robotparser if we don't already have one
        if location not in self._robotparsers:
            logger.info('getting robots.txt for %s', location)
            # cache a None first so a failed fetch is not retried
            self._robotparsers[location] = None
            try:
                rp = robotparser.RobotFileParser()
                rp.set_url(urlparse.urlunsplit(
                    (scheme, netloc, '/robots.txt', '', '')))
                rp.read()
                self._robotparsers[location] = rp
            except (TypeError, IOError, httplib.HTTPException):
                # ignore any problems setting up robot parser
                pass
        return self._robotparsers[location]

    def _is_yanked(self, url):
        """Check whether the specified url should not be checked at all.
        This uses the regular expressions passed with add_yanked_re() and
        the robots information present. Returns a short reason string if
        the url is yanked, None otherwise."""
        # check if it is yanked through the regexps
        for regexp in self._yanked_res.values():
            # if the url matches it is yanked and we can stop
            if regexp.search(url):
                return 'yanked'
        # check if we should avoid external links
        is_internal = self._is_internal(url)
        if not is_internal and config.AVOID_EXTERNAL_LINKS:
            return 'external avoided'
        # check if we should use robot parsers
        if not config.USE_ROBOTS:
            return None
        (scheme, netloc) = urlparse.urlsplit(url)[0:2]
        # skip schemes not having robot.txt files
        if scheme not in ('http', 'https'):
            return None
        # skip robot checks for external urls
        # TODO: make this configurable
        if not is_internal:
            return None
        # check robots for remaining links
        rp = self._get_robotparser(scheme, netloc)
        if rp and not rp.can_fetch('webcheck', url):
            return 'robot restricted'
        # fall back to allowing the url
        return None

    def _get_link(self, session, url):
        """Return the Link database object for the cleaned-up url,
        creating it if it does not exist yet."""
        return Link.get_or_create(session, Link.clean_url(url))

    def _get_links_to_crawl(self, session):
        """Return a query of links that still need to be fetched (not
        fetched, not yanked and within the configured depth)."""
        # NOTE: the == None comparisons are intentional; SQLAlchemy
        # translates them into IS NULL (`is None` would not work here)
        links = session.query(Link).filter(Link.fetched == None)
        if config.MAX_DEPTH is not None:
            links = links.filter(Link.depth <= config.MAX_DEPTH)
        return links.filter(Link.yanked == None)

    def crawl(self):
        """Crawl the website based on the urls specified with add_base().
        If the serialization file pointer is specified the crawler writes
        out updated links to the file while crawling the site."""
        # connect to the database
        self.setup_database()
        # configure urllib2 to store cookies in the output directory
        _setup_urllib2()
        # get a database session
        session = Session()
        # remove all links unless we continue a previous crawl
        if not config.CONTINUE:
            truncate_db()
        # add all base urls to the database
        for url in self.base_urls:
            self._get_link(session, url)
        # add some URLs from the database that haven't been fetched
        # (process in batches of at most 100 before re-querying)
        tocheck = self._get_links_to_crawl(session)
        remaining = tocheck.count()
        tocheck = tocheck[:100]
        remaining -= len(tocheck)
        # repeat until we have nothing more to check
        while tocheck:
            # choose a link from the tocheck list
            link = tocheck.pop()
            link.is_internal = self._is_internal(link.url)
            link.yanked = self._is_yanked(str(link.url))
            # see if there are any more links to check
            if not tocheck:
                tocheck = self._get_links_to_crawl(session)
                remaining = tocheck.count()
                tocheck = tocheck[:100]
                remaining -= len(tocheck)
            # skip link if there is nothing to check
            if link.yanked or link.fetched:
                continue
            # fetch the link's contents
            response = self._fetch_link(link)
            if response:
                self._parse_response(link, response)
            # flush database changes
            session.commit()
            # sleep between requests if configured
            if config.WAIT_BETWEEN_REQUESTS > 0:
                logger.debug('sleeping %s seconds',
                             config.WAIT_BETWEEN_REQUESTS)
                time.sleep(config.WAIT_BETWEEN_REQUESTS)
            logger.debug('items left to check: %d',
                         remaining + len(tocheck))
        session.commit()
        session.close()

    def _fetch_link(self, link):
        """Attempt to fetch the url and return the open response object.
        This updates the link with information retrieved; on failure the
        problem is recorded on the link and None is returned."""
        logger.info(link.url)
        # mark the link as fetched to avoid loops
        link.fetched = datetime.datetime.now()
        try:
            # FIXME: if an URI has a username:passwd add the uri, username
            # and password to the HTTPPasswordMgr
            request = urllib2.Request(link.url)
            parent = link.parents.first()
            if parent:
                request.add_header('Referer', parent.url)
            response = urllib2.urlopen(request, timeout=config.IOTIMEOUT)
            info = response.info()
            link.mimetype = info.gettype()
            link.set_encoding(response.headers.getparam('charset'))
            # get result code and other stuff
            link.status = str(response.code)
            try:
                link.size = int(info.getheader('Content-length'))
            except (TypeError, ValueError):
                # missing or malformed Content-length header
                pass
            mtime = info.getdate('Last-Modified')
            if mtime:
                link.mtime = datetime.datetime(*mtime[:7])
            # TODO: add checking for size
            return response
        except RedirectError as e:
            link.status = str(e.code)
            logger.info(str(e))
            # permanent redirects are flagged as link problems
            if e.code == 301:
                link.add_linkproblem(str(e))
            link.add_redirect(e.newurl)
        except urllib2.HTTPError as e:
            link.status = str(e.code)
            logger.info(str(e))
            link.add_linkproblem(str(e))
        except urllib2.URLError as e:
            logger.info(str(e))
            link.add_linkproblem(str(e))
        except KeyboardInterrupt:
            # handle this in a higher-level exception handler
            raise
        except Exception as e:
            # handle all other exceptions
            logger.exception('unknown exception caught: ' + str(e))
            link.add_linkproblem('error reading HTTP response: %s' % str(e))

    def _parse_response(self, link, response):
        """Parse the fetched response content with the parser module
        registered for the link's content-type (if any)."""
        # find a parser for the content-type
        parsermodule = webcheck.parsers.get_parsermodule(link.mimetype)
        if parsermodule is None:
            logger.debug('unsupported content-type: %s', link.mimetype)
            return
        try:
            # skip parsing of content if we were returned nothing
            content = response.read()
            if content is None:
                return
            # parse the content
            logger.debug('parsing using %s', parsermodule.__name__)
            parsermodule.parse(content, link)
        except KeyboardInterrupt:
            # handle this in a higher-level exception handler
            raise
        except Exception as e:
            # a broken page should not abort the crawl; record the
            # problem on the page instead
            logger.exception('problem parsing page: %s', str(e))
            link.add_pageproblem('problem parsing page: %s' % str(e))

    def postprocess(self):
        """Do some basic post processing of the collected data, including
        depth calculation of every link."""
        # ensure we have a connection to the database
        self.setup_database()
        # get a database session
        session = Session()
        # build the list of urls that were set up with add_base() that
        # do not have a parent (they form the base for the site)
        bases = []
        # iterate over a copy so base_urls can be modified in the loop
        for url in list(self.base_urls):
            link = self._get_link(session, url).follow_link()
            if not link:
                logger.warn('base link %s redirects to nowhere', url)
                self.base_urls.remove(url)
            else:
                bases.append(link)
        # if we got no base URLs, just use the first internal one we find
        if not self.base_urls:
            link = session.query(Link).filter(Link.is_internal == True).first()
            # NOTE(review): link may be None if nothing internal was
            # crawled at all, which would raise AttributeError below
            logger.debug('fallback to adding %s to base urls', link.url)
            self.base_urls.append(link.url)
            bases.append(link)
        # set the site name from the first base link
        self.site_name = bases[0].title or bases[0].url
        # do a breadth first traversal of the website to determine depth
        session.query(Link).update(dict(depth=None), synchronize_session=False)
        session.commit()
        depth = 0
        count = len(bases)
        for link in bases:
            link.depth = 0
        session.commit()
        while count > 0:
            logger.debug('%d links at depth %d%s', count, depth,
                         ' (max)' if depth == config.MAX_DEPTH else '')
            # update the depth of all links without a depth that have a
            # parent with the previous depth
            qry = session.query(Link).filter(Link.depth == None)
            qry = qry.filter(Link.linked_from.any(Link.depth == depth))
            count = qry.update(dict(depth=depth + 1), synchronize_session=False)
            session.commit()
            depth += 1
            # TODO: also handle embeds
        session.commit()
        session.close()
        # see if any of the plugins want to do postprocessing
        for plugin in self.plugins:
            if hasattr(plugin, 'postprocess'):
                logger.info(plugin.__name__)
                plugin.postprocess(self)

    def generate(self):
        """Generate output pages by calling every plugin that provides a
        generate() hook, then install the static theme files."""
        # ensure we have a connection to the database
        self.setup_database()
        # call all the plugins
        for plugin in self.plugins:
            if hasattr(plugin, 'generate'):
                logger.info(plugin.__name__)
                plugin.generate(self)
        # install theme files
        install_file('static/webcheck.css', True)
        install_file('static/fancytooltips/fancytooltips.js', True)
        install_file('static/favicon.ico', False)
|