Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/crawler.py
blob: 4305b260dda4d5250a8a186a682196b7a1e25def (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import os
import stat


def get_stat(name, path):
    """Collect relevant meta-data for a filesystem object.

    name -- the name to pass to lstat() (may be relative to the
            current working directory)
    path -- the logical path recorded in the returned meta-data

    Returns a dict with path, size, mtime, uid, gid, mode and a
    one-letter type code ('D' directory, 'C' char device, 'B' block
    device, 'F' fifo, 'L' symlink, 'S' socket, 'R' regular file,
    '?' unknown).  For all types except regular files size is reported
    as 0; symbolic links additionally carry a linkname entry with the
    link target.  May raise OSError if the object cannot be stat'ed.
    """
    # use lstat() so symlinks are described themselves, not their target
    s = os.lstat(name)
    meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
                uid=s.st_uid, gid=s.st_gid, mode=stat.S_IMODE(s.st_mode))
    if stat.S_ISDIR(s.st_mode):
        meta['type'] = 'D'
        meta['size'] = 0
    elif stat.S_ISCHR(s.st_mode):
        meta['type'] = 'C'
        meta['size'] = 0
    elif stat.S_ISBLK(s.st_mode):
        meta['type'] = 'B'
        meta['size'] = 0
    elif stat.S_ISFIFO(s.st_mode):
        meta['type'] = 'F'
        meta['size'] = 0
    elif stat.S_ISLNK(s.st_mode):
        meta['type'] = 'L'
        meta['size'] = 0
        # record the link target (resolves the old FIXME); best-effort
        # because the link could vanish between lstat() and readlink()
        try:
            meta['linkname'] = os.readlink(name)
        except OSError:
            pass
    elif stat.S_ISSOCK(s.st_mode):
        meta['type'] = 'S'
        meta['size'] = 0
    elif stat.S_ISREG(s.st_mode):
        meta['type'] = 'R'
    else:
        meta['type'] = '?'
    return meta


def walk(top, path_excludes, dir_excludes):
    """Recursively go over the specified directory and return all files and
    directories under it.

    Precondition: the current working directory must already be the
    directory that top refers to; all filesystem access below is done
    relative to the working directory, and the working directory is
    restored with chdir('..') after each recursive descent.

    top -- the logical path used in the yielded meta-data
    path_excludes -- compiled regex objects; entries whose path matches
                     any of them are skipped entirely
    dir_excludes -- compiled regex objects; directories whose path
                    matches any of them are not descended into

    Yields meta-data dicts as produced by get_stat(), the entry for top
    itself first, then non-directories, then the contents of each
    subdirectory.
    """
    meta = get_stat('.', top)
    yield meta
    if meta['type'] != 'D':
        return
    # get list of all entries in directory
    # (ignore any errors)
    try:
        # unicode argument makes listdir() return unicode names
        names = os.listdir(u'.')
    except os.error:
        return
    # return all non-directories first
    dirs = []
    for name in names:
        try:
            path = os.path.join(top, name)
            if any(x.search(path) for x in path_excludes):
                continue
            meta = get_stat(name, path)
            if meta['type'] == 'D':
                # directories are collected and descended into later
                if any(x.search(path) for x in dir_excludes):
                    continue
                dirs.append(name)
            else:
                yield meta
        except (UnicodeDecodeError, OSError):
            # best-effort: report the problem entry and keep crawling
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
    # recurse into directories
    for name in dirs:
        try:
            os.chdir(name)
        except (UnicodeDecodeError, OSError):
            # could not enter the directory; report and skip it
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
        else:
            for meta in walk(os.path.join(top, name),
                             path_excludes, dir_excludes):
                yield meta
            # undo the chdir(name) above before the next sibling
            os.chdir('..')


def crawl(paths, excludes=()):
    """Crawl the paths and yield meta-data for everything found under them.

    paths -- iterable of directory paths to scan
    excludes -- glob-like exclude patterns; patterns ending in '/' skip
                whole directory trees, other patterns skip matching paths

    Yields meta-data dicts as produced by get_stat() via walk().
    """
    from path import pattern2re
    # convert excludes: a trailing '/' marks a directory exclude
    path_excludes = [
        pattern2re(x) for x in excludes if not x.endswith('/')]
    dir_excludes = [
        pattern2re(x.rstrip('/')) for x in excludes if x.endswith('/')]
    # go over filesystem
    for path in paths:
        # force the path to unicode so listdir() returns unicode names
        path = u'' + path
        print('Scanning %s' % path)
        save_dir = os.getcwd()
        # walk() requires the working directory to be the scanned path
        os.chdir(path)
        try:
            for meta in walk(path, path_excludes, dir_excludes):
                yield meta
        finally:
            # restore the working directory even if walk() raises or the
            # generator is closed before being exhausted
            os.chdir(save_dir)