Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/crawler.py
blob: 3f36bc80b2d2ecc4cdef50453bb1b446b79d28ab (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import json
import os
import re
import stat


def get_stat(name, path):
    """Collect relevant meta-data for path.

    `name` is stat'ed relative to the current working directory while
    `path` is the full path that is reported back.  Returns a tuple of
    (path, size, is_dir, json-encoded meta-data).
    """
    info = os.lstat(name)
    meta = {
        'path': path,
        'size': info.st_size,
        'mtime': info.st_mtime,
        'ctime': info.st_ctime,
        'uid': info.st_uid,
        'gid': info.st_gid,
        'mode': info.st_mode,
    }
    is_dir = stat.S_ISDIR(info.st_mode)
    if is_dir:
        # directories get an is_dir marker and a nominal size of 1
        meta['is_dir'] = 1
        meta['size'] = 1
    return path, meta['size'], is_dir, json.dumps(meta, sort_keys=True)


def _print_walk_error(top, name):
    """Print a traceback and the offending directory entry (best-effort
    error reporting while crawling; the entry is skipped)."""
    import traceback
    print(traceback.format_exc())
    print('%r %r' % (top, name))


def walk(top, path_excludes, dir_excludes):
    """Recursively go over the specified directory and return all files and
    directories under it.

    The current working directory must already be `top`: entries are
    stat'ed relative to the cwd, while `top` is only used to build the
    reported paths.  Yields (path, size, is_dir, meta) tuples as produced
    by get_stat().  Entries matching path_excludes (and directories
    matching dir_excludes) are skipped; per-entry errors are printed and
    otherwise ignored.
    """
    path, size, is_dir, meta = get_stat('.', top)
    yield path, size, is_dir, meta
    if not is_dir:
        return
    # get list of all entries in directory
    # (ignore any errors)
    try:
        names = os.listdir(u'.')
    except OSError:
        return
    # return all non-directories first
    dirs = []
    for name in names:
        try:
            path = os.path.join(top, name)
            if any(x.search(path) for x in path_excludes):
                continue
            path, size, is_dir, meta = get_stat(name, path)
            if is_dir:
                if any(x.search(path) for x in dir_excludes):
                    continue
                dirs.append(name)
            else:
                yield path, size, is_dir, meta
        except (UnicodeDecodeError, OSError):
            _print_walk_error(top, name)
    # recurse into directories
    for name in dirs:
        try:
            os.chdir(name)
        except (UnicodeDecodeError, OSError):
            _print_walk_error(top, name)
        else:
            try:
                for entry in walk(os.path.join(top, name),
                                  path_excludes, dir_excludes):
                    yield entry
            finally:
                # always step back up, even when the consumer abandons the
                # generator or an error propagates from the recursion;
                # otherwise the process cwd is left inside a subdirectory
                # and every later relative lstat/listdir goes wrong
                os.chdir('..')


def crawl(paths, excludes=()):
    """Crawl the paths, store the crawled files in the database and find
    archives that contain files crawled.

    Yields (path, size, is_dir, meta) tuples for every file and directory
    found under the given paths.  Exclude patterns ending in '/' only
    match directories (pruning the whole subtree); all other patterns
    match individual paths.
    """
    from path import pattern2re
    # convert excludes; a trailing '/' marks a directory-only pattern
    path_excludes = [
        re.compile(pattern2re(x))
        for x in excludes if not x.endswith('/')]
    dir_excludes = [
        re.compile(pattern2re(x.rstrip('/')))
        for x in excludes if x.endswith('/')]
    # go over filesystem
    for top in paths:
        # force unicode so os.listdir() returns unicode names under Python 2
        top = u'' + top
        print('Scanning %s' % top)
        save_dir = os.getcwd()
        os.chdir(top)
        try:
            for entry in walk(top, path_excludes, dir_excludes):
                yield entry
        finally:
            # restore the original working directory even if walk() raises
            # or the consumer abandons the generator mid-iteration
            os.chdir(save_dir)