# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import json
import os
import re
import stat


def get_stat(name, path):
"""Collect relevant meta-data for path."""
s = os.lstat(name)
meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
ctime=s.st_ctime, uid=s.st_uid, gid=s.st_gid,
mode=s.st_mode)
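    # directories are flagged and given a nominal size of 1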
if stat.S_ISDIR(s.st_mode):
meta['is_dir'] = 1
meta['size'] = 1
return (
path, meta['size'], 'is_dir' in meta,
        json.dumps(meta, sort_keys=True))


def walk(top, path_excludes, dir_excludes):
"""Recursively go over the specified directory and return all files and
directories under it."""
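    # NOTE: the current working directory is expected to already be `top`;
    # crawl() performs the chdir() before calling this function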
path, size, is_dir, meta = get_stat('.', top)
yield path, size, is_dir, meta
if not is_dir:
return
# get list of all entries in directory
# (ignore any errors)
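    # a unicode argument makes listdir() return unicode names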
try:
names = os.listdir(u'.')
except os.error:
return
# return all non-directories first
dirs = []
for name in names:
try:
path = os.path.join(top, name)
if any(x.search(path) for x in path_excludes):
continue
path, size, is_dir, meta = get_stat(name, path)
if is_dir:
if any(x.search(path) for x in dir_excludes):
continue
dirs.append(name)
else:
yield path, size, is_dir, meta
except (UnicodeDecodeError, OSError):
import traceback
print(traceback.format_exc())
print('%r %r' % (top, name))
# recurse into directories
for name in dirs:
try:
os.chdir(name)
except (UnicodeDecodeError, OSError):
import traceback
print(traceback.format_exc())
print('%r %r' % (top, name))
else:
for path, size, is_dir, meta in walk(os.path.join(top, name),
path_excludes, dir_excludes):
yield path, size, is_dir, meta
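            # step back up to the parent directory once the subtree is done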
            os.chdir('..')


def crawl(paths, excludes=()):
"""Crawl the paths, store the crawled files in the database and find
archives that contain files crawled."""
from path import pattern2re
# convert excludes
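    # pattern2re() (from the project-local path module) turns an exclude
    # pattern into a regular expression; patterns ending in a slash are only
    # matched against directories, all other patterns are matched against
    # every path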
path_excludes = [
re.compile(pattern2re(x))
for x in excludes if not x.endswith('/')]
dir_excludes = [
re.compile(pattern2re(x.rstrip('/')))
for x in excludes if x.endswith('/')]
# go over filesystem
for path in paths:
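        # make sure we operate on a unicode path so that paths built from it
        # are unicode as well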
path = u'' + path
print('Scanning %s' % path)
save_dir = os.getcwd()
os.chdir(path)
for path, size, is_dir, meta in walk(path,
path_excludes, dir_excludes):
yield path, size, is_dir, meta
os.chdir(save_dir)
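

# A minimal usage sketch (not part of the original module): it assumes the
# project-local `path` module providing pattern2re() is importable, and the
# exclude patterns shown here are purely illustrative.
if __name__ == '__main__':
    import sys
    for path, size, is_dir, meta in crawl(sys.argv[1:] or [u'.'],
                                          excludes=('*.pyc', '.git/')):
        print('%s%s %d' % (path, '/' if is_dir else '', size))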