# fsck.py - functions for checking repository consistency
#
# Copyright (C) 2015 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.

import collections
import json
import os
import re


def _clean(config, repo, filename, msg='unused file or directory'):
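    """Report the file as superfluous and, when cleaning is enabled in
    the configuration, try to remove it from the repository backend."""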
    removed = False
    if config.clean:
        try:
            repo.backend.remove(filename)
            removed = True
        except EnvironmentError:
            import traceback
            print(traceback.format_exc())
    print('%s: %s%s' % (filename, msg, ' (REMOVED)' if removed else ''))


def _filter(extension, sequence):
    """Split the list into a list that has the items that begin with
    prefix and one with the other items."""
    l1, l2 = [], []
    extension = '.%s.' % extension
    for filename in sequence:
        if extension in filename:
            l1.append(filename)
        else:
            l2.append(filename)
    return l1, l2


def check_keys(config, repo):
    """Go over the files in the keys directory."""
    # get list of files
    files = list(repo.backend.listdir('keys'))
    # check presence and validity of passphrase file
    # (a missing passphrase file is ignored: the repository may simply
    # not use encryption)
    if 'keys/passphrase.gpg' in files:
        files.remove('keys/passphrase.gpg')
        # try to read the passphrase file; the value is not used further,
        # reading it just checks that the file can be decrypted
        from filters import GnuPGKeyEncryption
        try:
            with repo.read_file(
                    'keys/passphrase', encryption=GnuPGKeyEncryption()) as f:
                passphrase = str(f.read()).strip()
        except IOError:
            import traceback
            print(traceback.format_exc())
    # list superfluous files
    for filename in files:
        _clean(config, repo, filename, 'unknown file or directory')


def check_archive(config, repo, archive, files, found_archives):
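    """Check that the archive consists of exactly one tar file and one
    parsable JSON files list and report or clean up any other files."""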
    # TODO: check that file names contain recognised compression and
    #       encryption extensions
    # split files in archives, json files and others
    archives, other = _filter('tar', files)
    jsons, other = _filter('json', other)
    # check that there is exactly one tar file
    if len(archives) < 1:
        for filename in files:
            _clean(config, repo, filename,
                   'corresponding archive file missing')
        return  # skip further checks on this archive
    elif len(archives) > 1:
        for filename in archives:
            print('%s: duplicate archive file' % filename)
    # mark archive as found
    found_archives[archive] = 0
    # check that there is exactly one JSON file
    if len(jsons) < 1:
        for filename in files:
            print('%s: corresponding JSON file missing' % filename)
        # TODO: alternatively try to reconstruct the JSON from the archive
    elif len(jsons) > 1:
        for filename in jsons:
            print('%s: duplicate JSON file' % filename)
    if jsons:
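        # go over the file metadata to check that the JSON file parses
        # and that every entry has a path property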
        try:
            for meta in repo.read_fileslist('archives/%s.json' % archive):
                if not meta.get('path'):
                    raise ValueError('path property missing')
        except (StopIteration, ValueError, EnvironmentError):
            import traceback
            print(traceback.format_exc())
            print('%s: corrupt' % jsons[0])
    # check remaining files
    for filename in other:
        _clean(config, repo, filename, 'unknown file or directory')
    # TODO: consider unpacking the tar file (only if some option is
    #       specified and check that the information matches the JSON)


def check_archives(config, repo, found_archives):
    """Check the consistency of the archives directory."""
    # go over all found files in archives directory
    archives = collections.defaultdict(list)
    for filename in repo.backend.listdir('archives'):
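        # archive files are expected to be named ARCHIVE.tar or
        # ARCHIVE.json, optionally followed by compression or encryption
        # extensions, where ARCHIVE consists of two 8-character
        # alphanumeric parts joined by a dash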
        m = re.match(
            r'archives/([0-9a-zA-Z]{8}-[0-9a-zA-Z]{8})\.(json|tar)(\..*)?$',
            filename)
        if m:
            archives[m.group(1)].append(filename)
        else:
            _clean(config, repo, filename, 'unknown file or directory')
    # see if each archive has two files: one archive and one json
    for archive, files in archives.items():
        check_archive(config, repo, archive, files, found_archives)


def check_unused_archives(config, repo, found_archives):
    """Check archives directory for unused archievs ."""
    # go over archive files
    for filename in repo.backend.listdir('archives'):
        m = re.match(
            r'archives/([0-9a-zA-Z]{8}-[0-9a-zA-Z]{8})\.(json|tar)(\..*)?$',
            filename)
        if m:
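            # a count of zero means that the archive file exists but no
            # backup references it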
            if found_archives.get(m.group(1), 0) == 0:
                _clean(config, repo, filename, 'unused file')


def check_backup(config, repo, backup, found_archives):
    """Check the validity of the specified backup."""
    files = list(
        os.path.basename(f) for f in repo.backend.listdir('backups/' + backup))
    # check for presence of restore.sh
    if 'restore.sh' not in files:
        print('backups/%s/restore.sh: missing' % backup)
    else:
        # check contents of restore.sh (first line hashbang, last line exit 0)
        files.remove('restore.sh')
    # check for presence of info.json
    info_files = [x for x in files if x.startswith('info.json')]
    if not info_files:
        print('backups/%s/info.json: missing' % backup)
    elif len(info_files) > 1:
        for x in info_files:
            print('backups/%s/%s: duplicate info file' % (backup, x))
            files.remove(x)
    else:
        files.remove(info_files[0])
    # read info.json
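    # (fall back to an empty archives list if info.json cannot be read)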
    info = {'archives': []}
    if info_files:
        try:
            with repo.read_file('backups/%s/info.json' % backup) as f:
                info = json.load(f)
        except (IOError, ValueError) as e:
            print('backups/%s/%s: invalid or corrupt: %s' % (
                backup, info_files[0], e))
    # check that all required extractlists are present
    if 'extractlists' not in info:
        if info_files:
            print('backups/%s/%s: extractlist info missing' % (
                backup, info_files[0]))
        # we cannot know which extractlists should exist, so just drop all
        # per-archive .list files from the unknown-file check
        for archive in info['archives']:
            files = [x for x in files if not x.startswith('%s.list' % archive)]
    else:
        # check that each extractlist is present
        for archive in info['extractlists']:
            elists = [x for x in files if x.startswith('%s.list' % archive)]
            if not elists:
                print('backups/%s/%s.list: missing' % (backup, archive))
            elif len(elists) > 1:
                for filename in elists:
                    print('backups/%s/%s: duplicate extractlist file' % (
                        backup, filename))
                    files.remove(filename)
            else:
                files.remove(elists[0])
    # check that all required archives are present and update found_archives
    for archive in info['archives']:
        if archive not in found_archives:
            print('backups/%s: archive %s missing' % (backup, archive))
        else:
            found_archives[archive] += 1
    # check for presence of files.json
    files_files = [x for x in files if x.startswith('files.json')]
    if not files_files:
        print('backups/%s/files.json: missing' % backup)
    elif len(files_files) > 1:
        for x in files_files:
            print('backups/%s/%s: duplicate file list' % (backup, x))
            files.remove(x)
    else:
        files.remove(files_files[0])
    # read files.json
    if files_files:
        try:
            for meta in repo.read_fileslist('backups/%s/files.json' % backup):
                if not meta.get('path'):
                    raise ValueError('path property missing')
        except (StopIteration, ValueError, EnvironmentError):
            import traceback
            print(traceback.format_exc())
            print('backups/%s/%s: corrupt' % (backup, files_files[0]))
    # check remaining files
    for filename in files:
        filename = 'backups/%s/%s' % (backup, filename)
        _clean(config, repo, filename, 'unknown file or directory')


def check_backups(config, repo, found_archives):
    """Go over the list of backups in the repository."""
    for filename in repo.backend.listdir('backups'):
        check_backup(config, repo, os.path.basename(filename), found_archives)


def fsck(config, repo):
    """Check consistency of the repository."""
    # get contents of top-level directory
    top = list(repo.backend.listdir(''))
    archives = False
    backups = False
    keys = False
    # check that archives is present and is a directory
    if 'archives' not in top:
        print('archives: directory missing')
    elif not repo.backend.isdir('archives'):
        print('archives: is not a directory')
    else:
        archives = True
        top.remove('archives')
    # check that backups is present and is a directory
    if 'backups' not in top:
        print('backups: directory missing')
    elif not repo.backend.isdir('backups'):
        print('backups: is not a directory')
    else:
        backups = True
        top.remove('backups')
    # check that keys is present and is a directory
    if 'keys' not in top:
        # a missing keys directory is ignored
        # TODO: pass this on to the archives and backups checks so an
        #       error can be raised if encryption is used
        pass
    elif not repo.backend.isdir('keys'):
        print('keys: is not a directory')
    else:
        # TODO: try to open key file
        # TODO: use key to see if every encrypted file can be decrypted using this key
        keys = True
        top.remove('keys')
    # check presence and validity of uuid file
    if 'uuid' not in top:
        print('uuid: missing')
    elif repo.backend.isdir('uuid'):
        print('uuid: is not a file')
    else:
        top.remove('uuid')
        try:
            with repo.read_file('uuid') as f:
                uuid = str(f.read()).strip()
            # check uuid validity
            if not re.match(r'^[0-9a-z]{8,16}$', uuid):
                print('uuid: invalid')
        except IOError:
            import traceback
            print(traceback.format_exc())
    # list superfluous toplevel files
    for filename in top:
        _clean(config, repo, filename, 'unknown file or directory')
    # recurse into keys, archives and backups directories
    if keys:
        check_keys(config, repo)
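    # found_archives maps each archive name to the number of backups that
    # reference it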
    found_archives = {}
    if archives:
        check_archives(config, repo, found_archives)
    if backups:
        check_backups(config, repo, found_archives)
    if archives and backups:
        check_unused_archives(config, repo, found_archives)
    # print some statistics
    print('%d archives found' % len(found_archives))
    # tally the number of archives per reference count
    counts = collections.defaultdict(int)
    for archive, count in found_archives.items():
        counts[count] += 1
    for uses, number in counts.items():
        print('%d archives used %d times' % (number, uses))