# numdb.py - module for handling hierarchically organised numbers # # Copyright (C) 2010-2023 Arthur de Jong # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301 USA """Query structured number format files with number properties. This module contains functions for reading and querying a database that stores numbers that use a hierarchical format (e.g. ISBN, IBAN, phone numbers, etc). To read a database from a file: >>> with open('tests/numdb-test.dat', 'r') as f: ... dbfile = read(f) To split a number: >>> dbfile.split('01006') ['0', '100', '6'] >>> dbfile.split('902006') ['90', '20', '06'] >>> dbfile.split('909856') ['90', '985', '6'] To split the number and get properties for each part: >>> import pprint >>> pprint.pprint(dbfile.info('01006')) [('0', {'prop1': 'foo'}), ('100', {'prop2': 'bar'}), ('6', {})] >>> pprint.pprint(dbfile.info('02006')) [('0', {'prop1': 'foo'}), ('200', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})] >>> pprint.pprint(dbfile.info('03456')) [('0', {'prop1': 'foo'}), ('345', {'prop2': 'bar', 'prop3': 'baz'}), ('6', {})] >>> pprint.pprint(dbfile.info('902006')) [('90', {'prop1': 'booz'}), ('20', {'prop2': 'foo'}), ('06', {})] >>> pprint.pprint(dbfile.info('909856')) [('90', {'prop1': 'booz'}), ('985', {'prop2': 'fooz'}), ('6', {})] >>> pprint.pprint(dbfile.info('9889')) [('98', {'prop1': 'booz'}), ('89', {'prop2': 'foo'})] >>> pprint.pprint(dbfile.info('633322')) [('6', {'prop1': 'boo'}), ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}), ('22', {})] >>> pprint.pprint(dbfile.info('1200333')) [('1', {'prop1': 'foo'}), ('200', {'prop2': 'bar', 'prop3': 'baz'}), ('333', {'prop4': 'bax'})] """ from __future__ import annotations import re TYPE_CHECKING = False if TYPE_CHECKING: # pragma: no cover (only used when type checking) from collections.abc import Generator, Iterable from typing import IO, Any PrefixInfo = tuple[int, str, str, dict[str, str], list['PrefixInfo']] _line_re = re.compile( r'^(?P *)' r'(?P([^-,\s]+(-[^-,\s]+)?)(,[^-,\s]+(-[^-,\s]+)?)*)\s*' r'(?P.*)$') _prop_re = re.compile( r'(?P[0-9a-zA-Z-_]+)="(?P[^"]*)"') # this is a cache of open databases _open_databases = {} # the prefixes attribute of NumDB is structured as follows: # prefixes = [ # [ length, low, high, props, children ] # ... # ] # where children is a prefixes structure in its own right # (there is no expected ordering within the list) class NumDB(): """Number database.""" prefixes: list[PrefixInfo] def __init__(self) -> None: """Construct an empty database.""" self.prefixes = [] @staticmethod def _find(number: str, prefixes: list[PrefixInfo]) -> list[tuple[str, dict[str, str]]]: """Lookup the specified number in the list of prefixes, this will return basically what info() should return but works recursively.""" if not number: return [] part = number properties: dict[str, Any] = {} next_prefixes: list[PrefixInfo] = [] # go over prefixes and find matches for length, low, high, props, children in prefixes: if len(part) >= length and low <= part[:length] <= high: # only use information from the shortest match if length < len(part): part = part[:length] properties = {} next_prefixes = [] properties.update(props) next_prefixes.extend(children) # return first part and recursively find next matches return [(part, properties)] + NumDB._find(number[len(part):], next_prefixes) def info(self, number: str) -> list[tuple[str, dict[str, str]]]: """Split the provided number in components and associate properties with each component. This returns a tuple of tuples. Each tuple consists of a string (a part of the number) and a dict of properties. """ return NumDB._find(number, self.prefixes) def split(self, number: str) -> list[str]: """Split the provided number in components. This returns a tuple with the number of components identified.""" return [part for part, props in self.info(number)] def _parse( fp: Iterable[str], ) -> Generator[tuple[int, int, str, str, dict[str, str], list[PrefixInfo]]]: """Read lines of text from the file pointer and generate indent, length, low, high, properties tuples.""" for line in fp: # ignore comments if line[0] == '#' or line.strip() == '': continue # pragma: no cover (optimisation takes it out) # any other line should parse match = _line_re.search(line) assert match is not None indent = len(match.group('indent')) ranges = match.group('ranges') props = dict(_prop_re.findall(match.group('props'))) children: list[PrefixInfo] = [] for rnge in ranges.split(','): if '-' in rnge: low, high = rnge.split('-') else: low, high = rnge, rnge yield indent, len(low), low, high, props, children def read(fp: Iterable[str]) -> NumDB: """Return a new database with the data read from the specified file.""" last_indent = 0 db = NumDB() stack = {0: db.prefixes} for indent, length, low, high, props, children in _parse(fp): if indent > last_indent: # set our stack location to the last parent entry stack[indent] = stack[last_indent][-1][4] stack[indent].append((length, low, high, props, children)) last_indent = indent return db def _get_resource_stream(name: str) -> IO[bytes]: """Return a readable file-like object for the resource.""" try: # pragma: no cover (Python 3.9 and newer) import importlib.resources return importlib.resources.files(__package__).joinpath(name).open('rb') except (ImportError, AttributeError): # pragma: no cover (older Python versions) import pkg_resources # type: ignore[import-untyped] return pkg_resources.resource_stream(__name__, name) # type: ignore[no-any-return] def get(name: str) -> NumDB: """Open a database with the specified name to perform queries on.""" if name not in _open_databases: import codecs reader = codecs.getreader('utf-8') with reader(_get_resource_stream(name + '.dat')) as fp: _open_databases[name] = read(fp) return _open_databases[name]