1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
# http.py - handle urls with a http scheme
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk) <marduk@python.net>
# Copyright (C) 2002 Mike Meyer <mwm@mired.org>
# Copyright (C) 2005 Arthur de Jong <arthur@tiefighter.et.tudelft.nl>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""This module defines the functions needed for filling in information in Link
objects for urls using the http scheme."""
# http://www.ietf.org/rfc/rfc2616.txt
import string
import httplib
import urllib
import time
import urlparse
import base64
import mimetypes
import debugio
import version
import config
# url opener shared by this module for downloading documents; it goes through
# the configured proxies and identifies itself as webcheck
opener = urllib.FancyURLopener(config.PROXIES)
opener.addheaders = [('User-agent', 'webcheck ' + version.webcheck)]
# extra request headers from the configuration are sent after the user agent
if config.HEADERS:
    opener.addheaders = opener.addheaders + config.HEADERS
def _get_reply(link):
    """Send a HEAD request for the link and report the server's answer.

    The request is sent to the proxy configured in config.PROXIES['http'] if
    there is one (asking for the complete url), otherwise directly to the host
    named in link.netloc. A user:password part in link.netloc is sent along as
    HTTP Basic authentication.

    If the answer is a redirect (301, 302, 303 or 307) link.redirectdepth is
    set to one more than the deepest parent and the redirect target is
    registered with link.add_child(), unless config.REDIRECT_DEPTH is reached,
    the reply has no Location header or the target is the link itself.

    Returns a (status, reason, headers) tuple. If the status line of the
    reply can not be read (-1, <error message>, []) is returned instead.
    """
    # separate credentials embedded in the url (user:password@host) from the
    # host part so they do not end up in the host name we connect to
    (userpass, hostport) = urllib.splituser(link.netloc)
    if config.PROXIES and 'http' in config.PROXIES:
        # connect to the proxy and request the complete url from it
        host = urlparse.urlparse(config.PROXIES['http'])[1]
        path = link.url
    else:
        host = hostport
        path = ''.join((link.path, link.query))
        if not path:
            path = '/'
    if userpass is None:
        (user, passwd) = (None, None)
    else:
        (user, passwd) = urllib.splitpasswd(userpass)
    (host, port) = urllib.splitport(host)
    # without an explicit port httplib falls back to its default port
    conn = httplib.HTTPConnection(host, port or None)
    try:
        conn.putrequest('HEAD', path)
        if user and passwd:
            # encodestring() breaks its output into lines, remove every
            # newline so long credentials still give a single header line
            auth = base64.encodestring(user + ":" + passwd).replace('\n', '')
            conn.putheader('Authorization', 'Basic %s' % auth)
        conn.putheader('User-Agent', 'webcheck %s' % version.webcheck)
        conn.endheaders()
        try:
            response = conn.getresponse()
        except httplib.BadStatusLine as e:
            return (-1, "error reading HTTP response: " + str(e), [])
        errcode, errmsg, headers = response.status, response.reason, response.msg
    finally:
        # release the connection on every path, including errors
        conn.close()
    debugio.debug("HTTP response: %s %s" % (errcode, errmsg))
    # handle redirects
    #  301 = moved permanently
    #  302 = found
    #  303 = see other
    #  307 = temporary redirect
    if errcode in (301, 302, 303, 307):
        # TODO: consider pages linking to 301 (moved permanently) an error
        # determine depth
        redirectdepth = 0
        for p in link.parents:
            redirectdepth = max(redirectdepth, p.redirectdepth)
        link.redirectdepth = redirectdepth + 1
        # check depth
        if link.redirectdepth >= config.REDIRECT_DEPTH:
            debugio.error("too many redirects")
            return (errcode, errmsg, headers)
        # find url that is redirected to
        location = headers.get('location')
        if location is None:
            # a redirect that does not say where to go can not be followed
            debugio.error("redirect without location header")
            return (errcode, errmsg, headers)
        debugio.info('  redirected to: ' + location)
        location = urlparse.urljoin(link.url, location)
        if location == link.url:
            debugio.error("redirect same as source: %s" % location)
            return (errcode, errmsg, headers)
        # add child
        link.add_child(location)
        # TODO: add check for redirect loop detection
    return (errcode, errmsg, headers)
def fetch(link):
    """Fill in information about the link and return its content.

    Here, link is a reference of the link object that is calling this
    pseudo-method. The reply from _get_reply() is used to set link.mimetype
    (falling back to 'text/html' if the headers can not provide a type) and
    link.size (0 if there is no usable Content-Length header).

    If the status is not 200 a problem is added to the link and None is
    returned. Otherwise link.mtime is set from the Last-Modified header when
    that holds a usable date and the document read from link.url is returned.
    """
    (status, message, headers) = _get_reply(link)
    try:
        link.mimetype = headers.gettype()
    except AttributeError:
        # headers is not a message object (e.g. the reply could not be read)
        link.mimetype = 'text/html' # is this a good enough default?
    debugio.debug('content-type: ' + link.mimetype)
    try:
        link.size = int(headers['content-length'])
    except (KeyError, TypeError, ValueError):
        # header missing, headers not a message object or value not a number
        link.size = 0
    debugio.debug('size: ' + str(link.size))
    if status != 200:
        link.add_problem(str(status) + ": " + message)
        return
    try:
        link.mtime = time.mktime(headers.getdate('Last-Modified'))
    except (OverflowError, TypeError, ValueError):
        # no (usable) modification date, leave mtime as it is
        pass
    # download the document, closing the connection and cleaning up the
    # opener even if opening or reading fails
    try:
        response = opener.open(link.url)
        try:
            document = response.read()
        finally:
            response.close()
    finally:
        opener.cleanup()
    return document
|