diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2006-04-27 23:53:59 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2006-04-27 23:53:59 +0200 |
commit | 3e6a8c12558aa3fbe1e471a20ee2dc13d514be6d (patch) | |
tree | cab3f57d992e9ede18ad661be663e07c30c8eb0e | |
parent | dd715a1fd91ad86b1b3cfc5ed1baad35770d0722 (diff) |
also add all unfetched links from a site to make this method recallable
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@249 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- | crawler.py | 5 |
1 files changed, 5 insertions, 0 deletions
```diff
@@ -236,6 +236,11 @@ class Site:
         # TODO: have some different scheme to crawl a site (e.g. separate
         # internal and external queues, threading, etc)
         tocheck = []
+        # add all unfetched site urls
+        for link in self.linkMap.values():
+            if not link.isyanked and not link.isfetched:
+                tocheck.append(link)
+        # add all internal urls
         for url in self._internal_urls:
             tocheck.append(self.get_link(url))
         # repeat until we have nothing more to check
```