diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2005-08-16 22:36:47 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2005-08-16 22:36:47 +0200 |
commit | 73c26fc1bd1c02155fc49ff5c32c26d9cb58d98a (patch) | |
tree | cc2607d95fa5ef2b57a3d4a9dd208cae06595499 | |
parent | a70fb80cb1c7bdf0e0f0d3d97baeae8e244fa343 (diff) |
pick up configured filenames if present in directories
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@135 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- | config.py | 10 | ||||
-rw-r--r-- | schemes/file.py | 28 | ||||
-rw-r--r-- | schemes/ftp.py | 84 |
3 files changed, 76 insertions, 46 deletions
@@ -83,3 +83,13 @@ REPORT_WHATSNEW_URL_AGE = 7 # The size of a page in kilobytes after which the page is considered too big. REPORT_SLOW_URL_SIZE = 76 + +# A list of names that will be checked when encountering an file:/// +# directory. This file will be picked up instead of the directory list. +FILE_INDEXES = [ 'index.html', 'index.htm' ] + +# A list of names that will be checked when encountering an ftp:// +# directory. This file will be picked up instead of the directory list. +FTP_INDEXES = [ 'index.html', 'index.htm' ] + + diff --git a/schemes/file.py b/schemes/file.py index b3940ae..4b47c15 100644 --- a/schemes/file.py +++ b/schemes/file.py @@ -22,6 +22,7 @@ """This module defines the functions needed for creating Link objects for urls using the file scheme.""" +import config import debugio import urlparse import urllib @@ -35,19 +36,20 @@ def _fetch_directory(link, path, acceptedtypes): link.redirectdepth = 1 link.add_child(urlparse.urljoin(link.url,link.path+'/')) return - if os.path.isfile(os.path.join(path,'index.html')): - debugio.debug('pick up index.html from directory') - # the the directory contains an index.html, use that - return _fetch_file(link, os.path.join(path,'index.html'), acceptedtypes) - else: - # otherwise add the directory's files as children - debugio.debug('add files as children of this page') - try: - link.ispage = True - for f in os.listdir(path): - link.add_child(urlparse.urljoin(link.url,urllib.pathname2url(f))) - except os.error, e: - link.add_problem(str(e)) + # check contents of directory for some common files + for f in config.FILE_INDEXES: + if os.path.isfile(os.path.join(path,f)): + debugio.debug('pick up %s from directory' % f) + # the the directory contains an index.html, use that + return _fetch_file(link, os.path.join(path,f), acceptedtypes) + # otherwise add the directory's files as children + debugio.debug('add files as children of this page') + try: + link.ispage = True + for f in os.listdir(path): + link.add_child(urlparse.urljoin(link.url,urllib.pathname2url(f))) + except os.error, e: + link.add_problem(str(e)) def _fetch_file(link, path, acceptedtypes): # get stats of file diff --git a/schemes/ftp.py b/schemes/ftp.py index b009918..216affa 100644 --- a/schemes/ftp.py +++ b/schemes/ftp.py @@ -22,6 +22,7 @@ """This module defines the functions needed for creating Link objects for urls using the ftp scheme.""" +import config import urllib import mimetypes import ftplib @@ -73,52 +74,69 @@ def _cwd(ftp, path): dirs.pop(0) return None except ftplib.error_perm, e: - debugio.debug('schemes.ftp._cwd(): CWD '+d+' : '+str(e)) + debugio.debug('schemes.ftp._cwd(): CWD '+d+': '+str(e)) return string.join(dirs,'/') +def _fetch_directory(link, ftp, acceptedtypes): + """Handle the ftp directory.""" + # check that the url ends with a slash + if link.path[-1:] != '/': + debugio.debug('schemes.ftp._fetch_directory(): directory referenced without trailing slash') + link.redirectdepth = 1 + link.add_child(urlparse.urljoin(link.url,link.path+'/')) + return + # retreive the contents of the directory + # FIXME: this raises an exception for empty directories, probably replace with own command + contents = ftp.nlst() + # check contents of directory for some common files + for f in config.FTP_INDEXES: + if f in contents: + debugio.debug('schemes.ftp._fetch_directory(): pick up %s from directory' % f) + # the the directory contains an index.html, use that + return _fetch_file(link, ftp, f, acceptedtypes) + # just add files in directory as children + debugio.debug('schemes.ftp._fetch_directory(): add files as children of this page') + link.ispage = True + debugio.debug('schemes.ftp._fetch_directory(): TYPE A: '+ftp.voidcmd('TYPE A')) + # FIXME: this raises an exception for empty directories + for f in contents: + link.add_child(urlparse.urljoin(link.url,urllib.quote(f))) + +def _fetch_file(link, ftp, path, acceptedtypes): + """Try to download the file in path that should be in the current + directory of the ftp instance. The path can also point to a non-existant + file or directory.""" + # figure out the size of the document + link.size = ftp.size(path) + debugio.debug('schemes.ftp.fetch(): size='+str(link.size)) + # guess the mimetype of the document + if link.mimetype is None: + link.mimetype = mimetypes.guess_type(path)[0] + # try to fetch file + if link.mimetype in acceptedtypes: + debugio.debug('schemes.ftp.fetch(): TYPE I: '+ftp.voidcmd('TYPE I')) + (conn, size) = ftp.ntransfercmd('RETR ' + path) + if size: + content = conn.makefile().read(size) + else: + content = conn.makefile().read() + debugio.debug('schemes.ftp.fetch(): fetched, size=%d' % len(content)) + return content + def fetch(link, acceptedtypes): """Fetch the specified link.""" # try to fetch the document - content = None try: ftp = _getconnection(link.netloc) - debugio.debug('schemes.ftp.fetch(): FTP: CWD / : '+ftp.cwd('/')) + debugio.debug('schemes.ftp.fetch(): CWD /: '+ftp.cwd('/')) # descend down the directory tree as far as we can go path=urllib.unquote(link.path) path=_cwd(ftp, path) # check if we are dealing with an (exising) directory if path is None: - # check that the url ends with a slash - if link.path[-1:] != '/': - debugio.debug('schemes.ftp.fetch(): directory referenced without trailing slash') - link.redirectdepth = 1 - link.add_child(urlparse.urljoin(link.url,link.path+'/')) - else: - # add children - debugio.debug('schemes.ftp.fetch(): add files as children of this page') - link.ispage = True - debugio.debug('schemes.ftp.fetch(): TYPE A: '+ftp.voidcmd('TYPE A')) - # FIXME: this raises an exception for empty directories - for f in ftp.nlst(): - link.add_child(urlparse.urljoin(link.url,urllib.quote(f))) + return _fetch_directory(link, ftp, acceptedtypes) else: - # figure out the size of the document - link.size = ftp.size(path) - debugio.debug('schemes.ftp.fetch(): size='+str(link.size)) - # guess the mimetype of the document - if link.mimetype is None: - link.mimetype = mimetypes.guess_type(path)[0] - # try to fetch file - if link.mimetype in acceptedtypes: - debugio.debug('schemes.ftp.fetch(): TYPE I: '+ftp.voidcmd('TYPE I')) - (conn, size) = ftp.ntransfercmd('RETR ' + path) - if size: - content = conn.makefile().read(size) - else: - content = conn.makefile().read() - debugio.debug('schemes.ftp.fetch(): fetched, size=%d' % len(content)) + return _fetch_file(link, ftp, path, acceptedtypes) except ftplib.all_errors, e: debugio.debug('schemes.ftp.fetch(): CAUGHT '+str(e)) link.add_problem(str(e)) - # we're done - return content |