# fsck.py - functions for checking repository consistency
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import collections
import json
import os
import re


def _clean(config, repo, filename, msg='unused file or directory'):
    """Remove the file from the repository if cleaning is enabled and
    report what was (or would have been) done."""
    removed = False
    if config.clean:
        try:
            repo.backend.remove(filename)
            removed = True
        except EnvironmentError:
            import traceback
            print(traceback.format_exc())
    print('%s: %s%s' % (filename, msg, ' (REMOVED)' if removed else ''))


def _filter(extension, sequence):
    """Split the sequence into a list of file names that contain the
    .extension. part and a list with the other file names."""
    l1, l2 = [], []
    extension = '.%s.' % extension
    for filename in sequence:
        if extension in filename:
            l1.append(filename)
        else:
            l2.append(filename)
    return l1, l2


def check_keys(config, repo):
    """Go over the files in the keys directory."""
    # get list of files
    files = list(repo.backend.listdir('keys'))
    # check presence and validity of passphrase file
    if 'keys/passphrase.gpg' in files:
        files.remove('keys/passphrase.gpg')
        # try to read the passphrase file (the contents are not used
        # further, this only checks that the file can be read)
        from filters import GnuPGKeyEncryption
        try:
            with repo.read_file(
                    'keys/passphrase', encryption=GnuPGKeyEncryption()) as f:
                passphrase = str(f.read()).strip()
        except IOError:
            import traceback
            print(traceback.format_exc())
    # list superfluous files
    for filename in files:
        _clean(config, repo, filename, 'unknown file or directory')


def check_archive(config, repo, archive, files, found_archives):
    """Check the files that together make up the specified archive."""
    # TODO: check that file names contain recognised compression and
    # encryption extensions
    # split files in archives, json files and others
    archives, other = _filter('tar', files)
    jsons, other = _filter('json', other)
    # check that there is exactly one tar file
    if len(archives) < 1:
        for filename in files:
            _clean(config, repo, filename, 'corresponding archive file missing')
        return  # skip further checks on this archive
    elif len(archives) > 1:
        for filename in archives:
            print('%s: duplicate archive file' % filename)
    # mark archive as found
    found_archives[archive] = 0
    # check that there is exactly one JSON file
    if len(jsons) < 1:
        for filename in files:
            print('%s: corresponding JSON file missing' % filename)
        # TODO: alternatively try to reconstruct the JSON from the archive
    elif len(jsons) > 1:
        for filename in jsons:
            print('%s: duplicate JSON file' % filename)
    if jsons:
        try:
            for meta in repo.read_fileslist('archives/%s.json' % archive):
                if not meta.get('path'):
                    raise ValueError('path property missing')
        except (StopIteration, ValueError, EnvironmentError):
            import traceback
            print(traceback.format_exc())
            print('%s: corrupt' % jsons[0])
    # check remaining files
    for filename in other:
        _clean(config, repo, filename, 'unknown file or directory')
    # TODO: consider unpacking the tar file (only if some option is
    # specified) and check that the information matches the JSON
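

# For reference, a consistent archive in the archives directory consists of
# a payload/metadata pair matching the pattern checked in check_archives()
# below, for example (hypothetical names):
#   archives/0a1b2c3d-4e5f6a7b.tar.gz   - the archive data itself
#   archives/0a1b2c3d-4e5f6a7b.json.gz  - the list of files in the archive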


def check_archives(config, repo, found_archives):
    """Check the consistency of the archives directory."""
    # go over all found files in archives directory
    archives = collections.defaultdict(list)
    for filename in repo.backend.listdir('archives'):
        m = re.match(
            r'archives/([0-9a-zA-Z]{8}-[0-9a-zA-Z]{8})\.(json|tar)(\..*)?$',
            filename)
        if m:
            archives[m.group(1)].append(filename)
        else:
            _clean(config, repo, filename, 'unknown file or directory')
    # see if each archive has two files: one archive and one json
    for archive, files in archives.items():
        check_archive(config, repo, archive, files, found_archives)


def check_unused_archives(config, repo, found_archives):
    """Check the archives directory for unused archives."""
    # go over archive files
    for filename in repo.backend.listdir('archives'):
        m = re.match(
            r'archives/([0-9a-zA-Z]{8}-[0-9a-zA-Z]{8})\.(json|tar)(\..*)?$',
            filename)
        if m:
            if found_archives.get(m.group(1), 0) == 0:
                _clean(config, repo, filename, 'unused file')


def check_backup(config, repo, backup, found_archives):
    """Check the validity of the specified backup."""
    files = list(
        os.path.basename(f)
        for f in repo.backend.listdir('backups/' + backup))
    # check for presence of restore.sh
    if 'restore.sh' not in files:
        print('backups/%s/restore.sh: missing' % backup)
    else:
        # TODO: check contents of restore.sh (first line hashbang,
        # last line exit 0)
        files.remove('restore.sh')
    # check for presence of info.json
    info_files = [x for x in files if x.startswith('info.json')]
    if not info_files:
        print('backups/%s/info.json: missing' % backup)
    elif len(info_files) > 1:
        for x in info_files:
            print('backups/%s/%s: duplicate info file' % (backup, x))
            files.remove(x)
    else:
        files.remove(info_files[0])
    # read info.json
    info = {'archives': []}
    if info_files:
        try:
            with repo.read_file('backups/%s/info.json' % backup) as f:
                info = json.load(f)
        except (IOError, ValueError) as e:
            print('backups/%s/%s: invalid or corrupt: %s' % (
                backup, info_files[0], e))
    # check that all required extractlists are present
    if 'extractlists' not in info:
        if info_files:
            print('backups/%s/%s: extractlist info missing' % (
                backup, info_files[0]))
        # just remove all extractlists from the file list
        for archive in info['archives']:
            files = [x for x in files if not x.startswith('%s.list' % archive)]
    else:
        # check that each extractlist is present
        for archive in info['extractlists']:
            elists = [x for x in files if x.startswith('%s.list' % archive)]
            if not elists:
                print('backups/%s/%s.list: missing' % (backup, archive))
            elif len(elists) > 1:
                for filename in elists:
                    print('backups/%s/%s: duplicate extractlist file' % (
                        backup, filename))
                    files.remove(filename)
            else:
                files.remove(elists[0])
    # check that all required archives are present and update found_archives
    for archive in info['archives']:
        if archive not in found_archives:
            print('backups/%s: archive %s missing' % (backup, archive))
        else:
            found_archives[archive] += 1
    # check for presence of files.json
    files_files = [x for x in files if x.startswith('files.json')]
    if not files_files:
        print('backups/%s/files.json: missing' % backup)
    elif len(files_files) > 1:
        for x in files_files:
            print('backups/%s/%s: duplicate file list' % (backup, x))
            files.remove(x)
    else:
        files.remove(files_files[0])
    # read files.json
    if files_files:
        try:
            for meta in repo.read_fileslist('backups/%s/files.json' % backup):
                if not meta.get('path'):
                    raise ValueError('path property missing')
        except (StopIteration, ValueError, EnvironmentError):
            import traceback
            print(traceback.format_exc())
            print('backups/%s/%s: corrupt' % (backup, files_files[0]))
    # check remaining files
    for filename in files:
        filename = 'backups/%s/%s' % (backup, filename)
        _clean(config, repo, filename, 'unknown file or directory')
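

# For reference, check_backup() above expects backups/<backup>/info.json to
# contain at least something like the following (hypothetical values):
#   {"archives": ["0a1b2c3d-4e5f6a7b"],
#    "extractlists": ["0a1b2c3d-4e5f6a7b"]}
# Every id under "archives" must exist in the archives directory and every
# id under "extractlists" must have a matching <id>.list file in the backup
# directory.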


def check_backups(config, repo, found_archives):
    """Go over the list of backups in the repository."""
    for filename in repo.backend.listdir('backups'):
        check_backup(config, repo, os.path.basename(filename), found_archives)


def fsck(config, repo):
    """Check consistency of the repository."""
    # get contents of top-level directory
    top = list(repo.backend.listdir(''))
    archives = False
    backups = False
    keys = False
    # check that archives is present and is a directory
    if 'archives' not in top:
        print('archives: directory missing')
    elif not repo.backend.isdir('archives'):
        print('archives: is not a directory')
    else:
        archives = True
        top.remove('archives')
    # check that backups is present and is a directory
    if 'backups' not in top:
        print('backups: directory missing')
    elif not repo.backend.isdir('backups'):
        print('backups: is not a directory')
    else:
        backups = True
        top.remove('backups')
    # check that keys is present and is a directory
    if 'keys' not in top:
        # ignore a missing keys directory: perhaps also pass the result to
        # the archives and backups tests to give an error if encryption is
        # used
        pass
    elif not repo.backend.isdir('keys'):
        print('keys: is not a directory')
    else:
        # TODO: try to open key file
        # TODO: use key to see if every encrypted file can be decrypted
        # using this key
        keys = True
        top.remove('keys')
    # check presence and validity of uuid file
    if 'uuid' not in top:
        print('uuid: missing')
    elif repo.backend.isdir('uuid'):
        print('uuid: is not a file')
    else:
        top.remove('uuid')
        try:
            with repo.read_file('uuid') as f:
                uuid = str(f.read()).strip()
            # check uuid validity
            if not re.match(r'^[0-9a-z]{8,16}$', uuid):
                print('uuid: invalid')
        except IOError:
            import traceback
            print(traceback.format_exc())
    # list superfluous top-level files
    for filename in top:
        _clean(config, repo, filename, 'unknown file or directory')
    # recurse into keys, archives and backups directories
    if keys:
        check_keys(config, repo)
    found_archives = {}
    if archives:
        check_archives(config, repo, found_archives)
    if backups:
        check_backups(config, repo, found_archives)
    if archives and backups:
        check_unused_archives(config, repo, found_archives)
    # print some statistics
    print('%d archives found' % len(found_archives))
    counts = collections.defaultdict(int)
    for archive, count in found_archives.items():
        counts[count] += 1
    for use, archives in counts.items():
        print('%d archives used %d times' % (archives, use))
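

# Minimal usage sketch (the config and repository objects are assumed to be
# provided elsewhere in this package; the import paths and constructor
# arguments below are hypothetical):
#
#   from config import Config
#   from repo import Repository
#
#   config = Config()  # config.clean controls whether cruft is removed
#   repo = Repository('/path/to/repository')
#   fsck(config, repo)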