1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
# file.py - handle urls with a file scheme
#
# Copyright (C) 1998, 1999 Albert Hopkins (marduk)
# Copyright (C) 2002 Mike W. Meyer
# Copyright (C) 2005 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""This module defines the functions needed for creating Link objects for urls
using the file scheme."""
import debugio
import urlparse
import urllib
import os
import mimetypes
def _fetch_directory(link, path, acceptedtypes):
    """Handle a link whose local path refers to a directory.

    A directory that is referenced without a trailing path separator is
    answered with a redirect to the same url with '/' appended. If the
    directory holds an index.html that file is fetched instead (and its
    result returned), otherwise every directory entry is added as a child
    of the link and nothing is returned.
    """
    # directories have to be referenced with a trailing slash, redirect if not
    if not path.endswith(os.path.sep):
        debugio.debug('directory referenced without trailing slash')
        link.redirectdepth = 1
        link.add_child(urlparse.urljoin(link.url, link.path + '/'))
        return
    index = os.path.join(path, 'index.html')
    if not os.path.isfile(index):
        # no index.html present: add the directory's files as children
        debugio.debug('add files as children of this page')
        try:
            link.ispage = True
            for entry in os.listdir(path):
                child = urlparse.urljoin(link.url, urllib.pathname2url(entry))
                link.add_child(child)
        except os.error as err:
            link.add_problem(str(err))
        return
    # the directory contains an index.html, use that
    debugio.debug('pick up index.html from directory')
    return _fetch_file(link, index, acceptedtypes)
def _fetch_file(link, path, acceptedtypes):
    """Store basic information about a local file in the link object and
    return the file's contents if its mimetype is wanted.

    link -- the link object to update; size, mtime and (when not yet set)
            mimetype are filled in and problems are reported through
            link.add_problem()
    path -- local filesystem path of the file
    acceptedtypes -- container of mimetypes for which the contents should
                     be read

    Returns the contents of the file as a string, or None if the file
    could not be examined, could not be read or has a mimetype that is
    not in acceptedtypes.
    """
    # get stats of file
    try:
        stats = os.stat(path)
    except os.error as e:
        link.add_problem(str(e))
        return None
    link.size = stats.st_size
    link.mtime = stats.st_mtime
    # guess mimetype (only if nobody determined it before)
    if link.mimetype is None:
        link.mimetype = mimetypes.guess_type(path)[0]
    debugio.debug('mimetype=' + str(link.mimetype))
    debugio.debug('acceptedtypes=' + str(acceptedtypes))
    # fetch the document only if there is any point
    if link.mimetype not in acceptedtypes:
        return None
    debugio.debug('FETCH')
    try:
        # TODO: add size checking
        # the with block makes sure the file is closed again, also on errors
        with open(path, 'r') as f:
            return f.read()
    except IOError as e:
        debugio.debug('PROBLEM: ' + str(e))
        # record the failure on the link (this used to reference the
        # undefined name 'ink' and raised a NameError instead)
        link.add_problem(str(e))
    return None
def fetch(link, acceptedtypes):
    """Retrieve some basic information about the file.

    The path component of the link is translated into a local path which
    is handed to the directory handler or the plain file handler. The
    results are stored in the link object and whatever the handler
    returns is passed back to the caller.
    """
    # translate the url path into a local filesystem path
    localpath = urllib.url2pathname(link.path)
    # directories get special treatment, anything else is handled as a file
    if os.path.isdir(localpath):
        handler = _fetch_directory
    else:
        handler = _fetch_file
    return handler(link, localpath, acceptedtypes)
|