# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import json
import os
import stat


def get_stat(top, path):
    """Collect relevant meta-data for path."""
    # the lstat() is done relative to the current working directory;
    # walk() chdir()s into each directory before calling this
    s = os.lstat(path)
    path = top if path == '.' else os.path.join(top, path)
    # FIXME: figure out what info to consider (also filetype)
    meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
                ctime=s.st_ctime, uid=s.st_uid, gid=s.st_gid)
    if stat.S_ISDIR(s.st_mode):
        meta['is_dir'] = 1
        meta['size'] = 1
    return (
        path, meta['size'], 'is_dir' in meta,
        json.dumps(meta, sort_keys=True))


def walk(top):
    """Recursively go over the specified directory and yield all files
    and directories under it. The current working directory is expected
    to be top."""
    path, size, is_dir, meta = get_stat(top, '.')
    yield path, size, is_dir, meta
    if not is_dir:
        return
    # get list of all entries in directory
    # (ignore any errors)
    try:
        names = os.listdir(u'.')
    except os.error:
        return
    # yield all non-directories first
    dirs = []
    for name in names:
        try:
            path, size, is_dir, meta = get_stat(top, name)
            if is_dir:
                dirs.append(name)
            else:
                yield path, size, is_dir, meta
        except (UnicodeDecodeError, OSError):
            import traceback
            print traceback.format_exc()
            print repr(top), repr(name)
    # recurse into directories
    for name in dirs:
        os.chdir(name)
        for path, size, is_dir, meta in walk(os.path.join(top, name)):
            yield path, size, is_dir, meta
        os.chdir('..')


def crawl(paths, db):
    """Crawl the paths, store the crawled files in the database and
    mark archives that contain crawled files as used."""
    # clear archive usage
    db.connection.execute('''
        UPDATE `archives`
        SET `used` = 0
        ''')
    # go over filesystem
    for path in paths:
        path = unicode(path)
        print 'Scanning %s' % path
        save_dir = os.getcwd()
        os.chdir(path)
        for path, size, is_dir, meta in walk(path):
            # fill crawled table
            db.connection.execute('''
                INSERT INTO `crawled`
                  (`path`, `size`, `is_dir`, `meta`)
                VALUES
                  (?, ?, ?, ?)
                ''', (path, size, is_dir, meta))
            # TODO: consider making this an executemany for efficiency
            # update archive to indicate that some part can be used
            # TODO: consider moving the UPDATE to a separate query
            db.connection.execute('''
                UPDATE `archives`
                SET `used` = `size`
                WHERE `meta` = ?
                ''', (meta, ))
        db.commit()
        os.chdir(save_dir)
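

# The block below is an illustrative usage sketch, not part of the original
# module: it drives crawl() with a minimal sqlite3-backed database object.
# The Database class and the CREATE TABLE statements are assumptions made
# for this example (the column names are taken from the queries in crawl()
# above); the real project defines its own schema and database wrapper.
if __name__ == '__main__':
    import sqlite3
    import sys

    class Database(object):
        """Minimal stand-in exposing the attributes crawl() relies on."""

        def __init__(self, filename):
            self.connection = sqlite3.connect(filename)
            self.connection.execute('''
                CREATE TABLE IF NOT EXISTS `crawled`
                  (`path` TEXT, `size` INTEGER, `is_dir` INTEGER, `meta` TEXT)
                ''')
            self.connection.execute('''
                CREATE TABLE IF NOT EXISTS `archives`
                  (`meta` TEXT, `size` INTEGER, `used` INTEGER)
                ''')

        def commit(self):
            self.connection.commit()

    # crawl the directories given on the command line into an in-memory db
    crawl(sys.argv[1:], Database(':memory:'))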