Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArthur de Jong <arthur@arthurdejong.org>2015-06-30 22:48:24 +0200
committerArthur de Jong <arthur@arthurdejong.org>2015-06-30 22:58:58 +0200
commit1f56ce57c50f15e408ffc004e6f736f2fa3249b6 (patch)
tree0902504fdaa4e8de43a2763ec709916c5f90095d
parent4c3cbfb8876efe74a918deda71e180be31f785c7 (diff)
Do not cache full backup contents
Storing this in SQLite is slow and grows the cache to a huge size. The approach of reading these file lists may be a bit slower but saves a lot of space and overhead and removes quite some complexity.
-rwxr-xr-xbackup.py106
-rw-r--r--cache.py44
-rw-r--r--crawler.py7
-rw-r--r--path.py2
4 files changed, 36 insertions, 123 deletions
diff --git a/backup.py b/backup.py
index a13d4ec..6a45177 100755
--- a/backup.py
+++ b/backup.py
@@ -622,77 +622,49 @@ def list_backups(config, repo):
print(' %s' % ', '.join(extra))
+def _pattern_match(meta, patterns):
+ return not patterns or \
+ meta['path'] in patterns or \
+ any(meta['path'].startswith(pattern + '/')
+ for pattern in patterns)
+
+
def list_contents(config, repo):
"""List backup contents and print file listing."""
- from cache import MetaData, escape_like
from ls import ls_format
- backup = config.backup
- # bring metadata cache in sync with repository
- db = MetaData(config.cache_dir, repo.uuid)
- db.resync_backup_contents(repo, backup=backup)
- # filter file list
- pred = []
- args = [backup]
- for pattern in config.files:
- pattern = pattern.rstrip('/')
- pred.append('''`path` = ?''')
- args.append(pattern)
- pred.append('''`path` LIKE ? ESCAPE '\\' ''')
- args.append(escape_like(pattern) + '/%')
- cursor = db.connection.execute('''
- SELECT `path`, `meta`
- FROM `backup_contents`
- WHERE `backup` = ?
- %s %s
- ORDER BY `path`
- ''' % ('AND' if pred else '', ' OR '.join(pred)), args)
- # go over results
- print('%s:' % backup)
- for path, meta in cursor:
- ls_format(json.loads(meta))
+ patterns = [pattern.rstrip('/') for pattern in config.files]
+ print('%s:' % config.backup)
+ for meta in repo.read_fileslist('backups/%s/files.json' % config.backup):
+ if _pattern_match(meta, patterns):
+ ls_format(meta)
def find(config, repo):
"""Find archives containing the files."""
- from cache import MetaData
from ls import ls_format
from path import pattern2re
- # bring metadata cache in sync with repository
- db = MetaData(config.cache_dir, repo.uuid)
- db.resync_backup_contents(repo)
- # find backups containing file
- pred = []
- args = []
- for pattern in config.files:
- if pattern.endswith('/'):
- pred.append(
- '''(`path` REGEXP ? AND `meta` LIKE '%"type": "D"%')''')
- else:
- pred.append('`path` REGEXP ?')
- args.append(pattern2re(pattern.rstrip('/')))
- cursor = db.connection.execute('''
- SELECT `backup`, `meta`
- FROM `backup_contents`
- WHERE %s
- ORDER BY `backup`, `path`
- ''' % ' OR '.join(pred), args)
- # print results
- for backup, rows in itertools.groupby(cursor, lambda row: row[0]):
+ patterns = [
+ (pattern2re(pattern.rstrip('/')), pattern.endswith('/'))
+ for pattern in config.files]
+ for backup in repo.list_backups():
print('%s:' % backup)
- for row in rows:
- ls_format(json.loads(row[1]))
+ for meta in repo.read_fileslist('backups/%s/files.json' % backup):
+ ok = False
+ if any(pattern.match(meta['path']) and (
+ (not is_dir) or (meta['type'] == 'D'))
+ for pattern, is_dir in patterns):
+ ls_format(meta)
def restore(config, repo):
"""Restore files from a backup in the repository."""
- from cache import MetaData, escape_like
+ from cache import MetaData
from filters import GnuPGKeyEncryption, Reader
repo.keyencryption = GnuPGKeyEncryption()
backup = config.backup
# bring metadata cache in sync with repository
db = MetaData(config.cache_dir, repo.uuid)
db.resync_backups(repo)
- db.resync_backup_contents(repo, backup)
db.resync_archives(repo)
# get list of needed archives
cursor = db.connection.execute('''
@@ -707,27 +679,19 @@ def restore(config, repo):
CREATE TEMPORARY TABLE `tmp_torestore`
( `meta` TEXT NOT NULL );
''')
- pred = []
- args = [backup]
- for pattern in config.files:
- pattern = pattern.rstrip('/')
- pred.append('''`path` = ?''')
- args.append(pattern)
- pred.append('''`path` LIKE ? ESCAPE '\\' ''')
- args.append(escape_like(pattern) + '/%')
+ patterns = [pattern.rstrip('/') for pattern in config.files]
+ print('%s: reading files list' % config.backup)
+ fileslist = repo.read_fileslist('backups/%s/files.json' % backup)
with db.connection:
- pred = ' OR '.join(pred)
- if pred:
- pred = 'AND ( %s )' % pred
- db.connection.execute('''
+ db.connection.executemany('''
INSERT INTO `tmp_torestore`
(`meta`)
- SELECT `meta`
- FROM `backup_contents`
- WHERE `backup` = ?
- %s
- ORDER BY `path`
- ''' % pred, args)
+ VALUES
+ (?)
+ ''', (
+ (json.dumps(meta, sort_keys=True),)
+ for meta in fileslist
+ if _pattern_match(meta, patterns)))
db.connection.executescript('''
CREATE INDEX IF NOT EXISTS `tmp_torestore_meta_idx`
ON `tmp_torestore` (`meta`);
@@ -888,10 +852,6 @@ def remove_backups(config, repo):
DELETE FROM `backups`
WHERE `backup` = ?
''', (backup, ))
- db.connection.execute('''
- DELETE FROM `backup_contents`
- WHERE `backup` = ?
- ''', (backup, ))
# find archives that are no longer used
cursor = db.connection.execute('''
SELECT `archive`
diff --git a/cache.py b/cache.py
index 791be27..4f4c090 100644
--- a/cache.py
+++ b/cache.py
@@ -21,22 +21,9 @@
import json
import os
-import re
import sqlite3
-def escape_like(pattern):
- """Escape LIKE expression using backslash."""
- # perform escaping of \, % and _ first
- pattern = pattern.replace('\\', '\\\\')
- pattern = pattern.replace('%', '\\%')
- return pattern.replace('_', '\\_')
-
-
-def sqlite_regexp(expr, item):
- return re.search(expr, item) is not None
-
-
class MetaData(object):
def __init__(self, cache_dir, uuid):
@@ -51,7 +38,6 @@ class MetaData(object):
# set up SQLite connection
self.connection = sqlite3.connect(
'%s/%s-v2.sqlite' % (cache_dir, uuid))
- self.connection.create_function('REGEXP', 2, sqlite_regexp)
# create database tables
self.connection.executescript('''
PRAGMA secure_delete = false;
@@ -77,14 +63,6 @@ class MetaData(object):
`json` TEXT NOT NULL );
CREATE INDEX IF NOT EXISTS `backups_backup_idx`
ON `backups` (`backup`);
- CREATE TABLE IF NOT EXISTS `backup_contents`
- ( `backup` TEXT NOT NULL,
- `path` TEXT NOT NULL,
- `meta` TEXT NOT NULL );
- CREATE INDEX IF NOT EXISTS `backup_contents_backup_idx`
- ON `backup_contents` (`backup`);
- CREATE INDEX IF NOT EXISTS `backup_contents_path_idx`
- ON `backup_contents` (`path`);
''')
def is_crawled(self, path):
@@ -216,25 +194,3 @@ class MetaData(object):
except (EnvironmentError, ValueError):
import traceback
print(traceback.format_exc())
-
- def resync_backup_contents(self, repo, backup=None):
- """Update metadata cache with the information from the repository."""
- for backup in self._check_backups(repo, 'backup_contents',
- backup=backup):
- print('Importing backup %s file list' % backup)
- try:
- fileslist = repo.read_fileslist(
- 'backups/%s/files.json' % backup)
- with self.connection:
- self.connection.executemany('''
- INSERT INTO `backup_contents`
- (`backup`, `path`, `meta`)
- VALUES
- (?, ?, ?)
- ''', ((
- backup, meta['path'],
- json.dumps(meta, sort_keys=True))
- for meta in fileslist))
- except (EnvironmentError, ValueError):
- import traceback
- print(traceback.format_exc())
diff --git a/crawler.py b/crawler.py
index 82f160f..4305b26 100644
--- a/crawler.py
+++ b/crawler.py
@@ -20,7 +20,6 @@
# under the copyright of the software, unless explicitly stated otherwise.
import os
-import re
import stat
@@ -107,11 +106,9 @@ def crawl(paths, excludes=()):
from path import pattern2re
# convert excludes
path_excludes = [
- re.compile(pattern2re(x))
- for x in excludes if not x.endswith('/')]
+ pattern2re(x) for x in excludes if not x.endswith('/')]
dir_excludes = [
- re.compile(pattern2re(x.rstrip('/')))
- for x in excludes if x.endswith('/')]
+ pattern2re(x.rstrip('/')) for x in excludes if x.endswith('/')]
# go over filesystem
for path in paths:
path = u'' + path
diff --git a/path.py b/path.py
index 57d60e0..19d7656 100644
--- a/path.py
+++ b/path.py
@@ -63,7 +63,7 @@ def pattern2re(pattern):
else:
res = res + re.escape(c)
# TODO: do something special with patterns ending with /
- return res + '\Z(?ms)'
+ return re.compile(res + '\Z(?ms)')
def parents(path):