# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import json
import os
import re
import stat


def get_stat(name, path):
    """Collect relevant meta-data for path."""
    # lstat() so symbolic links are reported themselves rather than
    # followed; name is relative to the current working directory
    # because walk() chdirs into each directory it visits
    s = os.lstat(name)
    meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
                ctime=s.st_ctime, uid=s.st_uid, gid=s.st_gid,
                mode=s.st_mode)
    if stat.S_ISDIR(s.st_mode):
        # directories are flagged and given a nominal size of 1
        meta['is_dir'] = 1
        meta['size'] = 1
    return (
        path, meta['size'], 'is_dir' in meta,
        json.dumps(meta, sort_keys=True))


def walk(top, path_excludes, dir_excludes):
    """Recursively go over the specified directory and yield all
    files and directories under it (the current working directory is
    expected to already be top)."""
    # yield an entry for the top directory itself
    path, size, is_dir, meta = get_stat('.', top)
    yield path, size, is_dir, meta
    if not is_dir:
        return
    # get list of all entries in directory
    # (ignore any errors)
    try:
        names = os.listdir(u'.')
    except os.error:
        return
    # return all non-directories first, collecting directories so they
    # can be recursed into afterwards
    dirs = []
    for name in names:
        try:
            path = os.path.join(top, name)
            if any(x.search(path) for x in path_excludes):
                continue
            path, size, is_dir, meta = get_stat(name, path)
            if is_dir:
                if any(x.search(path) for x in dir_excludes):
                    continue
                dirs.append(name)
            else:
                yield path, size, is_dir, meta
        except (UnicodeDecodeError, OSError):
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
    # recurse into directories
    for name in dirs:
        try:
            os.chdir(name)
        except (UnicodeDecodeError, OSError):
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
        else:
            for path, size, is_dir, meta in walk(
                    os.path.join(top, name), path_excludes, dir_excludes):
                yield path, size, is_dir, meta
            os.chdir('..')


def crawl(paths, excludes=()):
    """Crawl the paths and yield an entry for each file and directory
    found, so the caller can store them in the database and find
    archives that contain the crawled files."""
    from path import pattern2re
    # convert excludes to compiled regular expressions; patterns with a
    # trailing slash only match directories
    path_excludes = [
        re.compile(pattern2re(x))
        for x in excludes if not x.endswith('/')]
    dir_excludes = [
        re.compile(pattern2re(x.rstrip('/')))
        for x in excludes if x.endswith('/')]
    # go over filesystem
    for path in paths:
        # make sure the path is a unicode string (Python 2)
        path = u'' + path
        print('Scanning %s' % path)
        save_dir = os.getcwd()
        os.chdir(path)
        for path, size, is_dir, meta in walk(
                path, path_excludes, dir_excludes):
            yield path, size, is_dir, meta
        os.chdir(save_dir)
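

# Minimal usage sketch (an illustrative addition, not part of the
# original module): drive the crawl() generator and print one line per
# entry. It assumes the project's own `path` module (which provides
# pattern2re) is importable; the directory and exclude patterns below
# are placeholder assumptions.
if __name__ == '__main__':
    for path, size, is_dir, meta in crawl(
            [u'/tmp'], excludes=['*.pyc', '.git/']):
        print('%s is_dir=%s size=%s' % (path, is_dir, size))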