# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import json
import os
import stat
import traceback


def get_stat(top, path):
    """Collect relevant meta-data for path (which is taken relative to the
    current working directory) and return a (path, size, is_dir, meta)
    tuple."""
    s = os.lstat(path)
    # report the path as being under top ('.' refers to top itself)
    path = top if path == '.' else os.path.join(top, path)
    # FIXME: figure out what info to consider (also filetype)
    meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
                ctime=s.st_ctime, uid=s.st_uid, gid=s.st_gid)
    if stat.S_ISDIR(s.st_mode):
        meta['is_dir'] = 1
        # use a nominal size of 1 for directories instead of the size
        # reported by lstat()
        meta['size'] = 1
    return (
        path, meta['size'], 'is_dir' in meta,
        json.dumps(meta, sort_keys=True))
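

# An illustration (all values hypothetical) of the tuple get_stat() returns
# for a regular file, assuming the current directory contains example.txt:
#
#   >>> get_stat(u'/data', u'example.txt')
#   (u'/data/example.txt', 42, False, '{"ctime": ..., "gid": ..., ...}')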


def walk(top):
    """Recursively go over the current working directory and yield a
    (path, size, is_dir, meta) tuple for it and every file and directory
    under it, with paths reported as being under top."""
    path, size, is_dir, meta = get_stat(top, '.')
    yield path, size, is_dir, meta
    if not is_dir:
        return
    # get a list of all entries in the directory, ignoring any errors;
    # the unicode argument makes listdir() return unicode names
    try:
        names = os.listdir(u'.')
    except os.error:
        return
    # yield all non-directories first, remembering directories for later
    dirs = []
    for name in names:
        try:
            path, size, is_dir, meta = get_stat(top, name)
            if is_dir:
                dirs.append(name)
            else:
                yield path, size, is_dir, meta
        except (UnicodeDecodeError, OSError):
            # report the problem but continue with the remaining entries
            print traceback.format_exc()
            print 'Could not process %r in %r' % (name, top)
    # recurse into directories
    for name in dirs:
        try:
            os.chdir(name)
        except OSError:
            continue  # skip directories we cannot enter
        try:
            for path, size, is_dir, meta in walk(os.path.join(top, name)):
                yield path, size, is_dir, meta
        finally:
            os.chdir('..')
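

# walk() stats entries relative to the current working directory, so the
# caller must chdir() into the directory first (crawl() below does this).
# A minimal usage sketch, assuming /some/dir exists:
#
#   os.chdir(u'/some/dir')
#   for path, size, is_dir, meta in walk(u'/some/dir'):
#       print path, size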


def crawl(paths, db):
    """Crawl the specified paths, store the files and directories found in
    the crawled table and mark archives that contain crawled files as
    used."""
    # clear archive usage
    db.connection.execute('''
        UPDATE `archives`
        SET `used` = 0
        ''')
    # go over filesystem
    for top in paths:
        top = unicode(top)
        print 'Scanning %s' % top
        save_dir = os.getcwd()
        os.chdir(top)
        for path, size, is_dir, meta in walk(top):
            # fill crawled table
            db.connection.execute('''
                INSERT INTO `crawled`
                  (`path`, `size`, `is_dir`, `meta`)
                VALUES
                  (?, ?, ?, ?)
                ''', (path, size, is_dir, meta))
            # TODO: consider making this an executemany for efficiency
            # update archive to indicate that some part can be used
            # TODO: consider moving the UPDATE to a separate query
            db.connection.execute('''
                UPDATE `archives`
                SET `used` = `size`
                WHERE `meta` = ?
                ''', (meta, ))
        db.commit()
        os.chdir(save_dir)
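

# The db object passed to crawl() is not defined in this module. The sketch
# below is a hypothetical stand-in (the class name, schema and column names
# are assumptions, not the real database module): an sqlite3-backed wrapper
# exposing the connection attribute and commit() method that crawl() uses,
# with minimal crawled and archives tables.

import sqlite3


class SketchDatabase(object):
    """Hypothetical minimal stand-in for the database wrapper."""

    def __init__(self, filename=':memory:'):
        self.connection = sqlite3.connect(filename)
        self.connection.executescript('''
            CREATE TABLE IF NOT EXISTS `crawled`
              (`path` TEXT, `size` INTEGER, `is_dir` INTEGER, `meta` TEXT);
            CREATE TABLE IF NOT EXISTS `archives`
              (`meta` TEXT, `size` INTEGER, `used` INTEGER DEFAULT 0);
            ''')

    def commit(self):
        self.connection.commit()


if __name__ == '__main__':
    # smoke test: crawl the current directory into an in-memory database
    # and print a few of the rows that were stored
    db = SketchDatabase()
    crawl([os.getcwd()], db)
    for row in db.connection.execute(
            'SELECT `path`, `size`, `is_dir` FROM `crawled` LIMIT 10'):
        print row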