1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
|
# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
import os
import stat
def get_stat(name, path):
    """Collect relevant meta-data for path.

    name is the entry relative to the current working directory (the
    crawler chdir()s into each directory); path is the full logical path
    that is recorded in the returned meta-data.

    Returns a dict with path, size, mtime, uid, gid, mode and a
    single-letter type code; for symlinks the link target is stored
    under 'linkname' when it can be read.
    """
    # use lstat() so symlinks are reported as links, not their targets
    s = os.lstat(name)
    meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
                uid=s.st_uid, gid=s.st_gid, mode=stat.S_IMODE(s.st_mode))
    if stat.S_ISDIR(s.st_mode):
        meta['type'] = 'D'
        meta['size'] = 0  # directory size is filesystem-dependent noise
    elif stat.S_ISCHR(s.st_mode):
        meta['type'] = 'C'
        meta['size'] = 0
    elif stat.S_ISBLK(s.st_mode):
        meta['type'] = 'B'
        meta['size'] = 0
    elif stat.S_ISFIFO(s.st_mode):
        meta['type'] = 'F'
        meta['size'] = 0
    elif stat.S_ISLNK(s.st_mode):
        meta['type'] = 'L'
        meta['size'] = 0
        try:
            # record the link target (best-effort: the link may vanish
            # between lstat() and readlink())
            meta['linkname'] = os.readlink(name)
        except OSError:
            pass
    elif stat.S_ISSOCK(s.st_mode):
        meta['type'] = 'S'
        meta['size'] = 0
    elif stat.S_ISREG(s.st_mode):
        meta['type'] = 'R'
    else:
        meta['type'] = '?'
    return meta
def walk(top, path_excludes, dir_excludes):
    """Recursively go over the specified directory and return all files and
    directories under it.

    The current working directory must already be top; the function
    chdir()s into subdirectories while recursing and restores the
    working directory afterwards. Yields the meta-data dicts produced
    by get_stat(), non-directories of each directory first.

    path_excludes and dir_excludes are sequences of compiled regular
    expressions matched against the full path; matching entries (and,
    for dir_excludes, whole subtrees) are skipped.
    """
    meta = get_stat('.', top)
    yield meta
    if meta['type'] != 'D':
        return
    # get list of all entries in directory
    # (ignore any errors)
    try:
        names = os.listdir(u'.')
    except OSError:
        return
    # return all non-directories first
    dirs = []
    for name in names:
        try:
            path = os.path.join(top, name)
            if any(x.search(path) for x in path_excludes):
                continue
            meta = get_stat(name, path)
            if meta['type'] == 'D':
                if any(x.search(path) for x in dir_excludes):
                    continue
                dirs.append(name)
            else:
                yield meta
        except (UnicodeDecodeError, OSError):
            # best-effort: report the problem entry and keep crawling
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
    # recurse into directories
    for name in dirs:
        try:
            os.chdir(name)
        except (UnicodeDecodeError, OSError):
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
        else:
            try:
                for meta in walk(os.path.join(top, name),
                                 path_excludes, dir_excludes):
                    yield meta
            finally:
                # always step back up, even if the recursion raises or
                # the consumer closes the generator early (GeneratorExit);
                # otherwise the process is left stranded deep in the tree
                os.chdir('..')
def crawl(paths, excludes=()):
    """Crawl the paths, store the crawled files in the database and find
    archives that contain files crawled.

    paths is a sequence of directory names to scan; excludes is a
    sequence of shell-style patterns, where a trailing '/' marks a
    directory-subtree exclude. Yields the meta-data dicts produced by
    walk() for every crawled entry.
    """
    from path import pattern2re
    # convert excludes: patterns ending in '/' prune whole directories,
    # the rest exclude individual paths
    path_excludes = [
        pattern2re(x) for x in excludes if not x.endswith('/')]
    dir_excludes = [
        pattern2re(x.rstrip('/')) for x in excludes if x.endswith('/')]
    # go over filesystem
    for path in paths:
        path = u'' + path  # force text type for consistent path handling
        print('Scanning %s' % path)
        save_dir = os.getcwd()
        os.chdir(path)
        try:
            for meta in walk(path, path_excludes, dir_excludes):
                yield meta
        finally:
            # restore the working directory even if walk() raises or the
            # consumer abandons the generator mid-scan
            os.chdir(save_dir)
|