diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2005-08-19 22:44:04 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2005-08-19 22:44:04 +0200 |
commit | 79a22f15d4890f31ae8314347654145185f40cf4 (patch) | |
tree | 01a2a3edb62168bf3dd4af171c46b302800a7df2 | |
parent | f80a9041ad64eda98cff51645e5180a87f2bde15 (diff) |
fix bug with following redirects where otherwise unreferenced links were removed and implement redirect loop detection
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@142 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- | crawler.py | 11 |
1 files changed, 7 insertions, 4 deletions
```diff
@@ -411,23 +411,26 @@ class Link:
         # parse the content
         parsermodule.parse(content, self)
 
-    def follow_link(self, delifunref=False):
+    def follow_link(self, delifunref=False, visited=[]):
         """If this link represents a redirect return the redirect target,
         otherwise return self. If delifunref is set this link is discarded
         if it has no parents. If this redirect does not find a referenced
         link None is returned."""
-        # FIXME: add checking for loops
         if self.redirectdepth == 0:
             return self
         if len(self.children) == 0:
             return None
+        # check for loops
+        visited.append(self)
+        if self.children[0] in visited:
+            return None
         # remove link if this is the only place that it's used
-        if (len(self.parents) == 0):
+        if (len(self.parents) == 0) and delifunref:
             # remove me from the linkMap
             del self.site.linkMap[self.url]
             # remove me from parents of child
             self.children[0].parents.remove(self)
-        return self.children[0].follow_link(delifunref)
+        return self.children[0].follow_link(delifunref, visited)
 
     def _pagechildren(self):
         """Determin the page children of this link, combining the children of
```