# db.py - database access layer for webcheck
#
# Copyright (C) 2011 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
import urlparse
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import distinct, func
from sqlalchemy import Table, Column, Integer, Boolean, String, DateTime, ForeignKey
from sqlalchemy.orm import relationship, backref, sessionmaker
from sqlalchemy.orm.session import object_session
from sqlalchemy.sql.expression import ClauseElement, union
import config
import debugio
import myurllib
# provide session and schema classes
# Session is the session factory; it is created without arguments here
# (presumably it is bound to an engine elsewhere - TODO confirm).
# Base is the declarative base class: the mapped classes below derive from
# it and the association tables are registered on Base.metadata.
Session = sessionmaker()
Base = declarative_base()
# association table recording which link (parent_id) links to which other
# link (child_id); both columns refer to links.id
children = Table(
    'children',
    Base.metadata,
    Column('parent_id', Integer,
           ForeignKey('links.id', ondelete='CASCADE'),
           index=True),
    Column('child_id', Integer,
           ForeignKey('links.id', ondelete='CASCADE'),
           index=True),
)
# association table recording which link (parent_id) embeds which other
# link (child_id); both columns refer to links.id
embedded = Table(
    'embedded',
    Base.metadata,
    Column('parent_id', Integer,
           ForeignKey('links.id', ondelete='CASCADE'),
           index=True),
    Column('child_id', Integer,
           ForeignKey('links.id', ondelete='CASCADE'),
           index=True),
)
class Link(Base):
    """A single URL stored in the 'links' table, together with the
    information gathered while retrieving it and its relationships to
    other links."""

    __tablename__ = 'links'

    id = Column(Integer, primary_key=True)
    url = Column(String, index=True, nullable=False, unique=True)
    is_internal = Column(Boolean, index=True)
    yanked = Column(String, index=True)
    fetched = Column(DateTime, index=True)

    # information about the retrieved link
    status = Column(String)
    mimetype = Column(String)
    encoding = Column(String)
    size = Column(Integer)
    mtime = Column(DateTime, index=True)
    is_page = Column(Boolean, index=True)
    title = Column(String, index=True)
    author = Column(String)

    # relationships between links
    # (the names on the right-hand side still refer to the module-level
    # association tables because the class attributes of the same name are
    # only bound after each relationship() call has been evaluated; both
    # relationships and their backrefs are dynamic, so they are used as
    # queries, e.g. with filter(), count() and first())
    children = relationship('Link', secondary=children,
                            backref=backref('linked_from', lazy='dynamic'),
                            primaryjoin=(id == children.c.parent_id),
                            secondaryjoin=(id == children.c.child_id),
                            lazy='dynamic')
    embedded = relationship('Link', secondary=embedded,
                            backref=backref('embedded_in', lazy='dynamic'),
                            primaryjoin=(id == embedded.c.parent_id),
                            secondaryjoin=(id == embedded.c.child_id),
                            lazy='dynamic')

    # crawling information
    redirectdepth = Column(Integer, default=0)
    depth = Column(Integer)

    @staticmethod
    def clean_url(url):
        """Return the normalised form of the URL with the fragment removed."""
        # normalise exactly once, the same way _get_link() does, so the
        # result can be compared with the url value of stored links
        return urlparse.urldefrag(myurllib.normalizeurl(url))[0]

    def _get_link(self, url):
        """Get a link object for the specified URL.

        The URL is normalised and stripped of its fragment. If no link with
        that URL is found in the session of this object a new Link is
        created and added to the session. If the URL had a fragment it is
        registered on the returned link as an anchor requested by this
        link."""
        # get the session
        session = object_session(self)
        # normalise the URL, removing the fragment from the URL
        url, fragment = urlparse.urldefrag(myurllib.normalizeurl(url))
        # try to find the link
        instance = session.query(Link).filter_by(url=url).first()
        if not instance:
            instance = Link(url=url)
            session.add(instance)
        # mark that we were looking for an anchor/fragment
        if fragment:
            instance.add_reqanchor(self, fragment)
        # return the link
        return instance

    def set_encoding(self, encoding):
        """Set the encoding of the link doing some basic checks to see if
        the encoding is supported.

        Nothing is done if the link already has an encoding or if the
        passed encoding is empty. If the check fails the traceback is
        printed and a page problem is recorded instead of setting the
        encoding."""
        if not self.encoding and encoding:
            try:
                debugio.debug('crawler.Link.set_encoding(%r)' % encoding)
                # decoding some text is the actual test of the encoding
                unicode('just some random text', encoding, 'replace')
                self.encoding = encoding
            except Exception:
                import traceback
                traceback.print_exc()
                self.add_pageproblem('unknown encoding: %s' % encoding)

    def add_redirect(self, url):
        """Indicate that this link redirects to the specified url.

        The redirect target is added as a child unless the redirect depth
        reaches config.REDIRECT_DEPTH or the target is the URL of this link
        itself; in those cases a link problem is recorded instead."""
        url = self.clean_url(url)
        # figure out depth: one more than the deepest of this link and all
        # links that refer to it
        self.redirectdepth = max([self.redirectdepth] +
                                 [x.redirectdepth for x in self.parents]) + 1
        # check depth
        if self.redirectdepth >= config.REDIRECT_DEPTH:
            self.add_linkproblem(
                'too many redirects (%d)' % self.redirectdepth)
            return
        # check for redirect to self
        if url == self.url:
            self.add_linkproblem('redirect same as source: %s' % url)
            return
        # add child
        self.add_child(url)

    def add_linkproblem(self, message):
        """Indicate that something went wrong while retrieving this link."""
        self.linkproblems.append(LinkProblem(message=message))

    def add_pageproblem(self, message):
        """Indicate that something went wrong with parsing the document.

        Problems are only recorded for internal links."""
        # only think about problems on internal pages
        if not self.is_internal:
            return
        # TODO: only include a single problem once (e.g. multiple anchors)
        self.pageproblems.append(PageProblem(message=message))

    def add_child(self, url):
        """Add the specified URL as a child of this link."""
        # ignore children for external links
        if not self.is_internal:
            return
        # add to children
        self.children.append(self._get_link(url))

    def add_embed(self, url):
        """Mark the given URL as used as an image on this page."""
        # ignore embeds for external links
        if not self.is_internal:
            return
        # add to embedded
        self.embedded.append(self._get_link(url))

    def add_anchor(self, anchor):
        """Indicate that this page contains the specified anchor.

        Anchors are stored in lower case. If the anchor was already
        registered for this link a page problem is recorded instead."""
        # lowercase anchor
        anchor = anchor.lower()
        if self.anchors.filter(Anchor.anchor == anchor).first():
            self.add_pageproblem(
                'anchor/id "%(anchor)s" defined multiple times'
                % {'anchor': anchor})
        else:
            self.anchors.append(Anchor(anchor=anchor))

    def add_reqanchor(self, parent, anchor):
        """Indicate that the specified link contains a reference to the
        specified anchor. This can be checked later.

        The parent is the link that contains the reference. Anchors are
        stored in lower case and each (parent, anchor) combination is only
        stored once for this link."""
        # lowercase anchor
        anchor = anchor.lower()
        # if RequestedAnchor doesn't exist, add it
        already_requested = self.reqanchors.filter(
            (RequestedAnchor.parent_id == parent.id) &
            (RequestedAnchor.anchor == anchor)).first()
        if not already_requested:
            self.reqanchors.append(
                RequestedAnchor(parent_id=parent.id, anchor=anchor))

    def follow_link(self, visited=None):
        """If this link represents a redirect return the redirect target,
        otherwise return self. If this redirect does not find a referenced
        link None is returned.

        The visited argument is the set of URLs that were already seen
        while following the redirects; it is used to detect loops (None is
        returned for a loop) and callers normally leave it out."""
        # if this is not a redirect just return
        if not self.redirectdepth:
            return self
        # the first (and only) child is the redirect target; fetching it
        # directly also tells us whether there is a child at all
        child = self.children.first()
        # if we don't know where this redirects, return None
        if child is None:
            return None
        # avoid loops
        if visited is None:
            visited = set()
        visited.add(self.url)
        if child.url in visited:
            return None
        # check where we redirect to
        return child.follow_link(visited)

    @property
    def count_parents(self):
        """The number of distinct links that link to this link plus the
        number of distinct links that embed this link (a link that does
        both is included in both counts)."""
        session = object_session(self)
        p1 = session.query(
            func.count(distinct(children.c.parent_id))
        ).filter(children.c.child_id == self.id)
        p2 = session.query(
            func.count(distinct(embedded.c.parent_id))
        ).filter(embedded.c.child_id == self.id)
        return p1.scalar() + p2.scalar()

    @property
    def parents(self):
        """A query for the distinct links that either link to this link or
        embed it."""
        session = object_session(self)
        # collect the ids of the parents from both association tables
        parent_ids = union(
            session.query(children.c.parent_id).filter(
                children.c.child_id == self.id),
            session.query(embedded.c.parent_id).filter(
                embedded.c.child_id == self.id))
        # select the links that match one of the collected ids
        return session.query(Link).filter(
            Link.id == parent_ids.c.children_parent_id).distinct()
class LinkProblem(Base):
    """A problem with the URL of a link itself, for instance a failure to
    download the resource it points to."""

    __tablename__ = 'linkproblems'

    id = Column(Integer, primary_key=True)
    link_id = Column(
        Integer,
        ForeignKey('links.id', ondelete='CASCADE'),
        index=True)
    message = Column(String, index=True)
    # the problems are available on the link as linkproblems, ordered by
    # message
    link = relationship(
        Link,
        backref=backref(
            'linkproblems',
            cascade='all,delete,delete-orphan',
            order_by=message))

    def __unicode__(self):
        """Return the problem message."""
        return self.message
class PageProblem(Base):
    """A problem with the content that was retrieved for a link, for
    instance invalid HTML."""

    __tablename__ = 'pageproblems'

    id = Column(Integer, primary_key=True)
    link_id = Column(
        Integer,
        ForeignKey('links.id', ondelete='CASCADE'),
        index=True)
    message = Column(String, index=True)
    # the problems are available on the link as pageproblems, ordered by
    # message
    link = relationship(
        Link,
        backref=backref(
            'pageproblems',
            cascade='all,delete,delete-orphan',
            order_by=message))

    def __unicode__(self):
        """Return the problem message."""
        return self.message
class Anchor(Base):
    """A named anchor (ID) that was found on the page of a link."""

    __tablename__ = 'anchors'

    id = Column(Integer, primary_key=True)
    link_id = Column(
        Integer,
        ForeignKey('links.id', ondelete='CASCADE'),
        index=True)
    # the anchors are available on the link as the dynamic (query)
    # attribute anchors
    link = relationship(
        Link,
        backref=backref(
            'anchors',
            cascade='all,delete,delete-orphan',
            lazy='dynamic'))
    anchor = Column(String)

    def __unicode__(self):
        """Return the name of the anchor."""
        return self.anchor
class RequestedAnchor(Base):
    """An anchor (fragment) of a link that another link refers to, stored
    so that it can be checked later. The link is the one the anchor was
    requested on and the parent is the link containing the reference."""

    __tablename__ = 'reqanchors'

    id = Column(Integer, primary_key=True)
    link_id = Column(
        Integer,
        ForeignKey('links.id', ondelete='CASCADE'),
        index=True)
    # there are two foreign keys to links so the join conditions are
    # spelled out; the requested anchors are available on the link as the
    # dynamic (query) attribute reqanchors
    link = relationship(
        Link,
        primaryjoin='Link.id == RequestedAnchor.link_id',
        backref=backref(
            'reqanchors',
            cascade='all,delete,delete-orphan',
            lazy='dynamic'))
    parent_id = Column(
        Integer,
        ForeignKey('links.id', ondelete='CASCADE'),
        index=True)
    parent = relationship(
        Link,
        primaryjoin='Link.id == RequestedAnchor.parent_id')
    anchor = Column(String)

    def __unicode__(self):
        """Return the name of the requested anchor."""
        return self.anchor
|