# crawler.py - filesystem crawler
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import os
import stat


def get_stat(name, path):
    """Collect relevant meta-data for path."""
    # use lstat() so symlinks are described rather than followed
    s = os.lstat(name)
    meta = dict(path=path, size=s.st_size, mtime=s.st_mtime,
                uid=s.st_uid, gid=s.st_gid, mode=stat.S_IMODE(s.st_mode))
    if stat.S_ISDIR(s.st_mode):
        meta['type'] = 'D'
        meta['size'] = 0
    elif stat.S_ISCHR(s.st_mode):
        meta['type'] = 'C'
        meta['size'] = 0
    elif stat.S_ISBLK(s.st_mode):
        meta['type'] = 'B'
        meta['size'] = 0
    elif stat.S_ISFIFO(s.st_mode):
        meta['type'] = 'F'
        meta['size'] = 0
    elif stat.S_ISLNK(s.st_mode):
        meta['type'] = 'L'
        meta['size'] = 0
        # FIXME: add linkname property
    elif stat.S_ISSOCK(s.st_mode):
        meta['type'] = 'S'
        meta['size'] = 0
    elif stat.S_ISREG(s.st_mode):
        meta['type'] = 'R'
    else:
        meta['type'] = '?'
    return meta


def walk(top, path_excludes, dir_excludes):
    """Recursively go over the specified directory and yield meta-data
    for it and all files and directories under it.

    The current working directory must already be top when this is
    called; the function chdir()s into subdirectories as it recurses."""
    meta = get_stat('.', top)
    yield meta
    if meta['type'] != 'D':
        return
    # get list of all entries in directory
    # (ignore any errors)
    try:
        names = os.listdir(u'.')
    except os.error:
        return
    # return all non-directories first
    dirs = []
    for name in names:
        try:
            path = os.path.join(top, name)
            if any(x.search(path) for x in path_excludes):
                continue
            meta = get_stat(name, path)
            if meta['type'] == 'D':
                if any(x.search(path) for x in dir_excludes):
                    continue
                dirs.append(name)
            else:
                yield meta
        except (UnicodeDecodeError, OSError):
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
    # recurse into directories
    for name in dirs:
        try:
            os.chdir(name)
        except (UnicodeDecodeError, OSError):
            import traceback
            print(traceback.format_exc())
            print('%r %r' % (top, name))
        else:
            for meta in walk(os.path.join(top, name),
                             path_excludes, dir_excludes):
                yield meta
            os.chdir('..')


def crawl(paths, excludes=()):
    """Crawl the specified paths and yield meta-data for every file and
    directory found, skipping anything that matches the exclude
    patterns."""
    from path import pattern2re
    # convert excludes: patterns with a trailing slash prune whole
    # directory trees, other patterns exclude individual paths
    path_excludes = [
        pattern2re(x) for x in excludes if not x.endswith('/')]
    dir_excludes = [
        pattern2re(x.rstrip('/')) for x in excludes if x.endswith('/')]
    # go over filesystem
    for path in paths:
        path = u'' + path
        print('Scanning %s' % path)
        save_dir = os.getcwd()
        os.chdir(path)
        for meta in walk(path, path_excludes, dir_excludes):
            yield meta
        os.chdir(save_dir)
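

# A minimal usage sketch, not part of the original module: crawl one or
# more directory trees and print one line per entry. The exclude
# patterns shown are hypothetical examples; pattern2re() is supplied by
# the project's own path module, so this only runs inside the project.
if __name__ == '__main__':
    import sys
    roots = sys.argv[1:] or [u'.']
    # a trailing slash ('.git/') prunes the whole directory tree,
    # while '*.pyc' excludes matching files individually
    for meta in crawl(roots, excludes=['*.pyc', '.git/']):
        print('%s %10d %s' % (meta['type'], meta['size'], meta['path']))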