Arthur de Jong

Open Source / Free Software developer

summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Arthur de Jong <arthur@arthurdejong.org> 2006-04-27 23:53:59 +0200
committer Arthur de Jong <arthur@arthurdejong.org> 2006-04-27 23:53:59 +0200
commit 3e6a8c12558aa3fbe1e471a20ee2dc13d514be6d (patch)
tree cab3f57d992e9ede18ad661be663e07c30c8eb0e
parent dd715a1fd91ad86b1b3cfc5ed1baad35770d0722 (diff)
also add all unfetched links from a site to make this method recallable
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@249 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- crawler.py 5
1 files changed, 5 insertions, 0 deletions
diff --git a/crawler.py b/crawler.py
index c738767..b5b68f5 100644
--- a/crawler.py
+++ b/crawler.py
@@ -236,6 +236,11 @@ class Site:
# TODO: have some different scheme to crawl a site (e.g. separate
# internal and external queues, threading, etc)
tocheck = []
+ # add all unfetched site urls
+ for link in self.linkMap.values():
+ if not link.isyanked and not link.isfetched:
+ tocheck.append(link)
+ # add all internal urls
for url in self._internal_urls:
tocheck.append(self.get_link(url))
# repeat until we have nothing more to check