Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Arthur de Jong <arthur@arthurdejong.org> 2005-08-16 22:36:47 +0200
committer Arthur de Jong <arthur@arthurdejong.org> 2005-08-16 22:36:47 +0200
commit 73c26fc1bd1c02155fc49ff5c32c26d9cb58d98a (patch)
tree cc2607d95fa5ef2b57a3d4a9dd208cae06595499
parent a70fb80cb1c7bdf0e0f0d3d97baeae8e244fa343 (diff)
pick up configured filenames if present in directories
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@135 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- config.py 10
-rw-r--r-- schemes/file.py 28
-rw-r--r-- schemes/ftp.py 84
3 files changed, 76 insertions, 46 deletions
diff --git a/config.py b/config.py
index 51e7514..8d1a0b5 100644
--- a/config.py
+++ b/config.py
@@ -83,3 +83,13 @@ REPORT_WHATSNEW_URL_AGE = 7
# The size of a page in kilobytes after which the page is considered too big.
REPORT_SLOW_URL_SIZE = 76
+
+# A list of names that will be checked when encountering a file:///
+# directory. This file will be picked up instead of the directory list.
+FILE_INDEXES = [ 'index.html', 'index.htm' ]
+
+# A list of names that will be checked when encountering an ftp://
+# directory. This file will be picked up instead of the directory list.
+FTP_INDEXES = [ 'index.html', 'index.htm' ]
+
+
diff --git a/schemes/file.py b/schemes/file.py
index b3940ae..4b47c15 100644
--- a/schemes/file.py
+++ b/schemes/file.py
@@ -22,6 +22,7 @@
"""This module defines the functions needed for creating Link objects for urls
using the file scheme."""
+import config
import debugio
import urlparse
import urllib
@@ -35,19 +36,20 @@ def _fetch_directory(link, path, acceptedtypes):
link.redirectdepth = 1
link.add_child(urlparse.urljoin(link.url,link.path+'/'))
return
- if os.path.isfile(os.path.join(path,'index.html')):
- debugio.debug('pick up index.html from directory')
- # the the directory contains an index.html, use that
- return _fetch_file(link, os.path.join(path,'index.html'), acceptedtypes)
- else:
- # otherwise add the directory's files as children
- debugio.debug('add files as children of this page')
- try:
- link.ispage = True
- for f in os.listdir(path):
- link.add_child(urlparse.urljoin(link.url,urllib.pathname2url(f)))
- except os.error, e:
- link.add_problem(str(e))
+ # check contents of directory for some common files
+ for f in config.FILE_INDEXES:
+ if os.path.isfile(os.path.join(path,f)):
+ debugio.debug('pick up %s from directory' % f)
+ # if the directory contains one of the configured index files, use that
+ return _fetch_file(link, os.path.join(path,f), acceptedtypes)
+ # otherwise add the directory's files as children
+ debugio.debug('add files as children of this page')
+ try:
+ link.ispage = True
+ for f in os.listdir(path):
+ link.add_child(urlparse.urljoin(link.url,urllib.pathname2url(f)))
+ except os.error, e:
+ link.add_problem(str(e))
def _fetch_file(link, path, acceptedtypes):
# get stats of file
diff --git a/schemes/ftp.py b/schemes/ftp.py
index b009918..216affa 100644
--- a/schemes/ftp.py
+++ b/schemes/ftp.py
@@ -22,6 +22,7 @@
"""This module defines the functions needed for creating Link objects for urls
using the ftp scheme."""
+import config
import urllib
import mimetypes
import ftplib
@@ -73,52 +74,69 @@ def _cwd(ftp, path):
dirs.pop(0)
return None
except ftplib.error_perm, e:
- debugio.debug('schemes.ftp._cwd(): CWD '+d+' : '+str(e))
+ debugio.debug('schemes.ftp._cwd(): CWD '+d+': '+str(e))
return string.join(dirs,'/')
+def _fetch_directory(link, ftp, acceptedtypes):
+ """Handle the ftp directory."""
+ # check that the url ends with a slash
+ if link.path[-1:] != '/':
+ debugio.debug('schemes.ftp._fetch_directory(): directory referenced without trailing slash')
+ link.redirectdepth = 1
+ link.add_child(urlparse.urljoin(link.url,link.path+'/'))
+ return
+ # retrieve the contents of the directory
+ # FIXME: this raises an exception for empty directories, probably replace with own command
+ contents = ftp.nlst()
+ # check contents of directory for some common files
+ for f in config.FTP_INDEXES:
+ if f in contents:
+ debugio.debug('schemes.ftp._fetch_directory(): pick up %s from directory' % f)
+ # if the directory contains one of the configured index files, use that
+ return _fetch_file(link, ftp, f, acceptedtypes)
+ # just add files in directory as children
+ debugio.debug('schemes.ftp._fetch_directory(): add files as children of this page')
+ link.ispage = True
+ debugio.debug('schemes.ftp._fetch_directory(): TYPE A: '+ftp.voidcmd('TYPE A'))
+ # FIXME: this raises an exception for empty directories
+ for f in contents:
+ link.add_child(urlparse.urljoin(link.url,urllib.quote(f)))
+
+def _fetch_file(link, ftp, path, acceptedtypes):
+ """Try to download the file in path that should be in the current
+ directory of the ftp instance. The path can also point to a non-existent
+ file or directory."""
+ # figure out the size of the document
+ link.size = ftp.size(path)
+ debugio.debug('schemes.ftp.fetch(): size='+str(link.size))
+ # guess the mimetype of the document
+ if link.mimetype is None:
+ link.mimetype = mimetypes.guess_type(path)[0]
+ # try to fetch file
+ if link.mimetype in acceptedtypes:
+ debugio.debug('schemes.ftp.fetch(): TYPE I: '+ftp.voidcmd('TYPE I'))
+ (conn, size) = ftp.ntransfercmd('RETR ' + path)
+ if size:
+ content = conn.makefile().read(size)
+ else:
+ content = conn.makefile().read()
+ debugio.debug('schemes.ftp.fetch(): fetched, size=%d' % len(content))
+ return content
+
def fetch(link, acceptedtypes):
"""Fetch the specified link."""
# try to fetch the document
- content = None
try:
ftp = _getconnection(link.netloc)
- debugio.debug('schemes.ftp.fetch(): FTP: CWD / : '+ftp.cwd('/'))
+ debugio.debug('schemes.ftp.fetch(): CWD /: '+ftp.cwd('/'))
# descend down the directory tree as far as we can go
path=urllib.unquote(link.path)
path=_cwd(ftp, path)
# check if we are dealing with an (existing) directory
if path is None:
- # check that the url ends with a slash
- if link.path[-1:] != '/':
- debugio.debug('schemes.ftp.fetch(): directory referenced without trailing slash')
- link.redirectdepth = 1
- link.add_child(urlparse.urljoin(link.url,link.path+'/'))
- else:
- # add children
- debugio.debug('schemes.ftp.fetch(): add files as children of this page')
- link.ispage = True
- debugio.debug('schemes.ftp.fetch(): TYPE A: '+ftp.voidcmd('TYPE A'))
- # FIXME: this raises an exception for empty directories
- for f in ftp.nlst():
- link.add_child(urlparse.urljoin(link.url,urllib.quote(f)))
+ return _fetch_directory(link, ftp, acceptedtypes)
else:
- # figure out the size of the document
- link.size = ftp.size(path)
- debugio.debug('schemes.ftp.fetch(): size='+str(link.size))
- # guess the mimetype of the document
- if link.mimetype is None:
- link.mimetype = mimetypes.guess_type(path)[0]
- # try to fetch file
- if link.mimetype in acceptedtypes:
- debugio.debug('schemes.ftp.fetch(): TYPE I: '+ftp.voidcmd('TYPE I'))
- (conn, size) = ftp.ntransfercmd('RETR ' + path)
- if size:
- content = conn.makefile().read(size)
- else:
- content = conn.makefile().read()
- debugio.debug('schemes.ftp.fetch(): fetched, size=%d' % len(content))
+ return _fetch_file(link, ftp, path, acceptedtypes)
except ftplib.all_errors, e:
debugio.debug('schemes.ftp.fetch(): CAUGHT '+str(e))
link.add_problem(str(e))
- # we're done
- return content