From d55b995e35eb18d9d89aed0d4977af46dd8e5b22 Mon Sep 17 00:00:00 2001
From: Arthur de Jong
Date: Tue, 24 Sep 2013 18:39:42 +0200
Subject: Get response size and modified date from request

---
 webcheck/crawler.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/webcheck/crawler.py b/webcheck/crawler.py
index abd1828..d126ca7 100644
--- a/webcheck/crawler.py
+++ b/webcheck/crawler.py
@@ -353,12 +353,18 @@ class Crawler(object):
             if parent:
                 request.add_header('Referer', parent.url)
             response = urllib2.urlopen(request, timeout=config.IOTIMEOUT)
+            info = response.info()
             link.mimetype = response.info().gettype()
             link.set_encoding(response.headers.getparam('charset'))
-            # FIXME: get result code and other stuff
+            # get result code and other stuff
             link.status = str(response.code)
-            # link.size = int(response.getheader('Content-length'))
-            # link.mtime = time.mktime(response.msg.getdate('Last-Modified'))
+            try:
+                link.size = int(info.getheader('Content-length'))
+            except (TypeError, ValueError):
+                pass
+            mtime = info.getdate('Last-Modified')
+            if mtime:
+                link.mtime = datetime.datetime(*mtime[:7])
             # if response.status == 301: link.add_linkproblem(str(response.status)+': '+response.reason)
             # elif response.status != 200: link.add_linkproblem(str(response.status)+': '+response.reason)
             # TODO: add checking for size
-- 
cgit v1.2.3