Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org>, 2005-08-19 22:44:04 +0200
committer: Arthur de Jong <arthur@arthurdejong.org>, 2005-08-19 22:44:04 +0200
commit: 79a22f15d4890f31ae8314347654145185f40cf4 (patch)
tree: 01a2a3edb62168bf3dd4af171c46b302800a7df2
parent: f80a9041ad64eda98cff51645e5180a87f2bde15 (diff)
fix bug with following redirects where otherwise unreferenced links were removed and implement redirect loop detection
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@142 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- crawler.py 11
1 files changed, 7 insertions, 4 deletions
diff --git a/crawler.py b/crawler.py
index f6984f8..c3ebb7b 100644
--- a/crawler.py
+++ b/crawler.py
@@ -411,23 +411,26 @@ class Link:
# parse the content
parsermodule.parse(content, self)
- def follow_link(self, delifunref=False):
+ def follow_link(self, delifunref=False, visited=[]):
"""If this link represents a redirect return the redirect target,
otherwise return self. If delifunref is set this link is discarded
if it has no parents. If this redirect does not find a referenced
link None is returned."""
- # FIXME: add checking for loops
if self.redirectdepth == 0:
return self
if len(self.children) == 0:
return None
+ # check for loops
+ visited.append(self)
+ if self.children[0] in visited:
+ return None
# remove link if this is the only place that it's used
- if (len(self.parents) == 0):
+ if (len(self.parents) == 0) and delifunref:
# remove me from the linkMap
del self.site.linkMap[self.url]
# remove me from parents of child
self.children[0].parents.remove(self)
- return self.children[0].follow_link(delifunref)
+ return self.children[0].follow_link(delifunref, visited)
def _pagechildren(self):
"""Determin the page children of this link, combining the children of