From 1f56ce57c50f15e408ffc004e6f736f2fa3249b6 Mon Sep 17 00:00:00 2001 From: Arthur de Jong Date: Tue, 30 Jun 2015 22:48:24 +0200 Subject: Do not cache full backup contents Storing the backup contents in SQLite is slow and grows the cache to a huge size. The approach of reading these files lists from the repository when they are needed may be a bit slower, but it saves a lot of space and overhead and removes quite a bit of complexity. --- backup.py | 106 +++++++++++++++++++------------------------------------------ cache.py | 44 ------------------------- crawler.py | 7 ++-- path.py | 2 +- 4 files changed, 36 insertions(+), 123 deletions(-) diff --git a/backup.py b/backup.py index a13d4ec..6a45177 100755 --- a/backup.py +++ b/backup.py @@ -622,77 +622,49 @@ def list_backups(config, repo): print(' %s' % ', '.join(extra)) +def _pattern_match(meta, patterns): + return not patterns or \ + meta['path'] in patterns or \ + any(meta['path'].startswith(pattern + '/') + for pattern in patterns) + + def list_contents(config, repo): """List backup contents and print file listing.""" - from cache import MetaData, escape_like from ls import ls_format - backup = config.backup - # bring metadata cache in sync with repository - db = MetaData(config.cache_dir, repo.uuid) - db.resync_backup_contents(repo, backup=backup) - # filter file list - pred = [] - args = [backup] - for pattern in config.files: - pattern = pattern.rstrip('/') - pred.append('''`path` = ?''') - args.append(pattern) - pred.append('''`path` LIKE ? ESCAPE '\\' ''') - args.append(escape_like(pattern) + '/%') - cursor = db.connection.execute(''' - SELECT `path`, `meta` - FROM `backup_contents` - WHERE `backup` = ? 
- %s %s - ORDER BY `path` - ''' % ('AND' if pred else '', ' OR '.join(pred)), args) - # go over results - print('%s:' % backup) - for path, meta in cursor: - ls_format(json.loads(meta)) + patterns = [pattern.rstrip('/') for pattern in config.files] + print('%s:' % config.backup) + for meta in repo.read_fileslist('backups/%s/files.json' % config.backup): + if _pattern_match(meta, patterns): + ls_format(meta) def find(config, repo): """Find archives containing the files.""" - from cache import MetaData from ls import ls_format from path import pattern2re - # bring metadata cache in sync with repository - db = MetaData(config.cache_dir, repo.uuid) - db.resync_backup_contents(repo) - # find backups containing file - pred = [] - args = [] - for pattern in config.files: - if pattern.endswith('/'): - pred.append( - '''(`path` REGEXP ? AND `meta` LIKE '%"type": "D"%')''') - else: - pred.append('`path` REGEXP ?') - args.append(pattern2re(pattern.rstrip('/'))) - cursor = db.connection.execute(''' - SELECT `backup`, `meta` - FROM `backup_contents` - WHERE %s - ORDER BY `backup`, `path` - ''' % ' OR '.join(pred), args) - # print results - for backup, rows in itertools.groupby(cursor, lambda row: row[0]): + patterns = [ + (pattern2re(pattern.rstrip('/')), pattern.endswith('/')) + for pattern in config.files] + for backup in repo.list_backups(): print('%s:' % backup) - for row in rows: - ls_format(json.loads(row[1])) + for meta in repo.read_fileslist('backups/%s/files.json' % backup): + ok = False + if any(pattern.match(meta['path']) and ( + (not is_dir) or (meta['type'] == 'D')) + for pattern, is_dir in patterns): + ls_format(meta) def restore(config, repo): """Restore files from a backup in the repository.""" - from cache import MetaData, escape_like + from cache import MetaData from filters import GnuPGKeyEncryption, Reader repo.keyencryption = GnuPGKeyEncryption() backup = config.backup # bring metadata cache in sync with repository db = MetaData(config.cache_dir, repo.uuid) 
db.resync_backups(repo) - db.resync_backup_contents(repo, backup) db.resync_archives(repo) # get list of needed archives cursor = db.connection.execute(''' @@ -707,27 +679,19 @@ def restore(config, repo): CREATE TEMPORARY TABLE `tmp_torestore` ( `meta` TEXT NOT NULL ); ''') - pred = [] - args = [backup] - for pattern in config.files: - pattern = pattern.rstrip('/') - pred.append('''`path` = ?''') - args.append(pattern) - pred.append('''`path` LIKE ? ESCAPE '\\' ''') - args.append(escape_like(pattern) + '/%') + patterns = [pattern.rstrip('/') for pattern in config.files] + print('%s: reading files list' % config.backup) + fileslist = repo.read_fileslist('backups/%s/files.json' % backup) with db.connection: - pred = ' OR '.join(pred) - if pred: - pred = 'AND ( %s )' % pred - db.connection.execute(''' + db.connection.executemany(''' INSERT INTO `tmp_torestore` (`meta`) - SELECT `meta` - FROM `backup_contents` - WHERE `backup` = ? - %s - ORDER BY `path` - ''' % pred, args) + VALUES + (?) + ''', ( + (json.dumps(meta, sort_keys=True),) + for meta in fileslist + if _pattern_match(meta, patterns))) db.connection.executescript(''' CREATE INDEX IF NOT EXISTS `tmp_torestore_meta_idx` ON `tmp_torestore` (`meta`); @@ -888,10 +852,6 @@ def remove_backups(config, repo): DELETE FROM `backups` WHERE `backup` = ? ''', (backup, )) - db.connection.execute(''' - DELETE FROM `backup_contents` - WHERE `backup` = ? 
- ''', (backup, )) # find archives that are no longer used cursor = db.connection.execute(''' SELECT `archive` diff --git a/cache.py b/cache.py index 791be27..4f4c090 100644 --- a/cache.py +++ b/cache.py @@ -21,22 +21,9 @@ import json import os -import re import sqlite3 -def escape_like(pattern): - """Escape LIKE expression using backslash.""" - # perform escaping of \, % and _ first - pattern = pattern.replace('\\', '\\\\') - pattern = pattern.replace('%', '\\%') - return pattern.replace('_', '\\_') - - -def sqlite_regexp(expr, item): - return re.search(expr, item) is not None - - class MetaData(object): def __init__(self, cache_dir, uuid): @@ -51,7 +38,6 @@ class MetaData(object): # set up SQLite connection self.connection = sqlite3.connect( '%s/%s-v2.sqlite' % (cache_dir, uuid)) - self.connection.create_function('REGEXP', 2, sqlite_regexp) # create database tables self.connection.executescript(''' PRAGMA secure_delete = false; @@ -77,14 +63,6 @@ class MetaData(object): `json` TEXT NOT NULL ); CREATE INDEX IF NOT EXISTS `backups_backup_idx` ON `backups` (`backup`); - CREATE TABLE IF NOT EXISTS `backup_contents` - ( `backup` TEXT NOT NULL, - `path` TEXT NOT NULL, - `meta` TEXT NOT NULL ); - CREATE INDEX IF NOT EXISTS `backup_contents_backup_idx` - ON `backup_contents` (`backup`); - CREATE INDEX IF NOT EXISTS `backup_contents_path_idx` - ON `backup_contents` (`path`); ''') def is_crawled(self, path): @@ -216,25 +194,3 @@ class MetaData(object): except (EnvironmentError, ValueError): import traceback print(traceback.format_exc()) - - def resync_backup_contents(self, repo, backup=None): - """Update metadata cache with the information from the repository.""" - for backup in self._check_backups(repo, 'backup_contents', - backup=backup): - print('Importing backup %s file list' % backup) - try: - fileslist = repo.read_fileslist( - 'backups/%s/files.json' % backup) - with self.connection: - self.connection.executemany(''' - INSERT INTO `backup_contents` - (`backup`, 
`path`, `meta`) - VALUES - (?, ?, ?) - ''', (( - backup, meta['path'], - json.dumps(meta, sort_keys=True)) - for meta in fileslist)) - except (EnvironmentError, ValueError): - import traceback - print(traceback.format_exc()) diff --git a/crawler.py b/crawler.py index 82f160f..4305b26 100644 --- a/crawler.py +++ b/crawler.py @@ -20,7 +20,6 @@ # under the copyright of the software, unless explicitly stated otherwise. import os -import re import stat @@ -107,11 +106,9 @@ def crawl(paths, excludes=()): from path import pattern2re # convert excludes path_excludes = [ - re.compile(pattern2re(x)) - for x in excludes if not x.endswith('/')] + pattern2re(x) for x in excludes if not x.endswith('/')] dir_excludes = [ - re.compile(pattern2re(x.rstrip('/'))) - for x in excludes if x.endswith('/')] + pattern2re(x.rstrip('/')) for x in excludes if x.endswith('/')] # go over filesystem for path in paths: path = u'' + path diff --git a/path.py b/path.py index 57d60e0..19d7656 100644 --- a/path.py +++ b/path.py @@ -63,7 +63,7 @@ def pattern2re(pattern): else: res = res + re.escape(c) # TODO: do something special with patterns ending with / - return res + '\Z(?ms)' + return re.compile(res + '\Z(?ms)') def parents(path): -- cgit v1.2.3