# cache.py - functions for metadata cache
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import json
import os
import sqlite3


class MetaData(object):
    """SQLite-backed cache of archive and backup metadata."""

    def __init__(self, cache_dir, uuid):
        # set a default cache directory
        if not cache_dir:
            import xdg.BaseDirectory
            cache_dir = os.path.join(
                xdg.BaseDirectory.xdg_cache_home, 'sloth')
        # create cache directory if needed
        if not os.path.isdir(cache_dir):
            os.makedirs(cache_dir)
        # set up SQLite connection
        self.connection = sqlite3.connect(
            '%s/%s-v2.sqlite' % (cache_dir, uuid))
        # create database tables
        self.connection.executescript('''
            PRAGMA secure_delete = false;
            PRAGMA temp_store = MEMORY;
            PRAGMA cache_size = 20000;
            PRAGMA synchronous = OFF;
            PRAGMA journal_mode = MEMORY;
            CREATE TABLE IF NOT EXISTS `archive_contents` (
                `id` INTEGER PRIMARY KEY,
                `archive` TEXT NOT NULL,
                `path` TEXT NOT NULL,
                `size` INTEGER NOT NULL,
                `meta` TEXT NOT NULL,
                `used` INTEGER NOT NULL DEFAULT 0
            );
            CREATE INDEX IF NOT EXISTS `archive_contents_archive_idx`
                ON `archive_contents` (`archive`);
            CREATE INDEX IF NOT EXISTS `archive_contents_path_idx`
                ON `archive_contents` (`path`);
            CREATE INDEX IF NOT EXISTS `archive_contents_meta_idx`
                ON `archive_contents` (`meta`);
            CREATE TABLE IF NOT EXISTS `backups` (
                `backup` TEXT PRIMARY KEY,
                `json` TEXT NOT NULL
            );
            CREATE INDEX IF NOT EXISTS `backups_backup_idx`
                ON `backups` (`backup`);
            ''')

    def is_crawled(self, path):
        """Check whether the path is present in the `crawled` table."""
        cursor = self.connection.execute('''
            SELECT 1
            FROM `crawled`
            WHERE `path` = ?
            ''', (path, ))
        return bool(list(cursor))

    def resync_archives(self, repo):
        """Update metadata cache with the information from the repository."""
        # set up temporary table to hold list of archives found in repository
        self.connection.executescript('''
            CREATE TEMPORARY TABLE `tmp_archives` (
                `archive` TEXT NOT NULL
            );
            CREATE INDEX IF NOT EXISTS `tmp_archives_archive_idx`
                ON `tmp_archives` (`archive`);
            ''')
        # get list of archives in the repository
        with self.connection:
            self.connection.executemany('''
                INSERT INTO `tmp_archives` (`archive`)
                VALUES (?)
                ''', ((archive, ) for archive in repo.list_archives()))
        # remove archives from database that are not in the repository
        with self.connection:
            self.connection.execute('''
                DELETE FROM `archive_contents`
                WHERE `archive` NOT IN (
                    SELECT `archive` FROM `tmp_archives`)
                ''')
        # read archive metadata that is missing from the database
        cursor = self.connection.execute('''
            SELECT `archive` FROM `tmp_archives`
            WHERE `archive` NOT IN (
                SELECT DISTINCT `archive` FROM `archive_contents`)
            ''')
        # use fetchall because SQLite cannot handle partial reads from a cursor
        # if the database is being modified in another cursor
        for archive, in cursor.fetchall():
            print('Importing archive %s file list' % archive)
            try:
                fileslist = repo.read_fileslist('archives/%s.json' % archive)
                with self.connection:
                    self.connection.executemany('''
                        INSERT INTO `archive_contents`
                            (`archive`, `path`, `size`, `meta`)
                        VALUES (?, ?, ?, ?)
                        ''', ((
                            archive, meta['path'], meta['size'],
                            json.dumps(meta, sort_keys=True))
                            for meta in fileslist))
            except (EnvironmentError, ValueError):
                import traceback
                print(traceback.format_exc())
        # clean up
        self.connection.executescript('''
            DROP TABLE `tmp_archives`;
            ''')

    def _check_backups(self, repo, table, backup=None):
        """Return the list of backups that need to be synced, checking the
        specified table."""
        if backup is not None:
            # if we already have data for the backup, we're done
            cursor = self.connection.execute('''
                SELECT 1
                FROM `%s`
                WHERE `backup` = ?
                ''' % table, (backup, ))
            if bool(list(cursor)):
                return []
            return [backup]
        # get list of backups in the repository
        self.connection.executescript('''
            CREATE TEMPORARY TABLE `tmp_backups` (
                `backup` TEXT NOT NULL
            );
            CREATE INDEX IF NOT EXISTS `tmp_backups_backup_idx`
                ON `tmp_backups` (`backup`);
            ''')
        with self.connection:
            self.connection.executemany('''
                INSERT INTO `tmp_backups` (`backup`)
                VALUES (?)
                ''', ((backup, ) for backup in repo.list_backups()))
        # remove backups from database that are not in the repository
        with self.connection:
            self.connection.execute('''
                DELETE FROM `%s`
                WHERE `backup` NOT IN (
                    SELECT `backup` FROM `tmp_backups`)
                ''' % table)
        # read backup metadata that is missing from the database
        backups = [
            row[0] for row in self.connection.execute('''
                SELECT `backup` FROM `tmp_backups`
                WHERE `backup` NOT IN (
                    SELECT `backup` FROM `%s`)
                ''' % table)]
        # clean up
        self.connection.executescript('''
            DROP TABLE IF EXISTS `tmp_backups`;
            ''')
        return backups

    def resync_backups(self, repo):
        """Update metadata cache with the information from the repository."""
        for backup in self._check_backups(repo, 'backups'):
            print('Importing backup %s metadata' % backup)
            try:
                with repo.read_file('backups/%s/info.json' % backup) as f:
                    info = json.load(f)
                with self.connection:
                    self.connection.execute('''
                        INSERT OR REPLACE INTO `backups` (`backup`, `json`)
                        VALUES (?, ?)
                        ''', (
                            backup, json.dumps(info, sort_keys=True)))
            except (EnvironmentError, ValueError):
                import traceback
                print(traceback.format_exc())
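

# A minimal usage sketch, assuming a repository object that provides the
# interface used above (list_archives(), list_backups(), read_fileslist() and
# read_file()).  _StubRepo and the 'example' uuid are hypothetical stand-ins
# for illustration, not part of the real repository implementation.
if __name__ == '__main__':
    import tempfile

    class _StubRepo(object):
        """Hypothetical repository that contains no archives and no backups."""

        def list_archives(self):
            return []

        def list_backups(self):
            return []

    # create the cache in a throw-away directory and sync it against the
    # (empty) stub repository; this exercises schema creation and the
    # temporary-table bookkeeping without touching any real data
    cache = MetaData(tempfile.mkdtemp(), 'example')
    repo = _StubRepo()
    cache.resync_archives(repo)
    cache.resync_backups(repo)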