#!/usr/bin/env python3

# update/cfi.py - script to download CFI code list from the SIX group
#
# Copyright (C) 2022 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

"""This script downloads the list of CFI codes as published by the SIX group."""

import re

import lxml.html
import requests
import xlrd


# the page that contains the link to the current CFI code list spreadsheet
download_url = 'https://www.six-group.com/en/products-services/financial-information/data-standards.html'


def normalise(value):
    """Clean and minimise attribute names and values.

    Everything from the first opening parenthesis, opening square bracket
    or newline onwards is dropped (together with any spaces directly in
    front of it) and the remainder is stripped of surrounding whitespace.
    """
    # The flags must be passed by keyword: the fourth positional argument
    # of re.sub() is count, not flags. DOTALL lets ".*" run across line
    # breaks so one substitution discards the complete tail, however many
    # lines the cell contains.
    return re.sub(r' *[(\[\n].*', '', value, flags=re.DOTALL).strip()


def get_categories(sheet):
    """Yield a (code, name) tuple for every top-level CFI category row.

    A row counts as a category when its first cell holds a single
    character and its second cell is not empty.
    """
    for cells in sheet.get_rows():
        code = cells[0].value
        if len(code) != 1:
            continue
        name = cells[1].value
        if name:
            yield (code, name)


def get_attributes(sheet):
    """Collect the attributes and their values from a group-specific sheet.

    Returns a list of (attribute name, values) tuples in the order they
    appear in the sheet, where values is a list of (character, description)
    tuples. Names and descriptions are passed through normalise().
    """
    collected = []
    current_name = None
    current_values = None
    for cells in sheet.get_rows():
        # a row with the first and third cell filled and the second one
        # empty starts a new attribute
        starts_attribute = (
            cells[0].value and not cells[1].value and cells[2].value)
        if starts_attribute:
            current_name = normalise(cells[2].value)
            current_values = []
            collected.append((current_name, current_values))
            continue
        # any other row with the second and third cell filled is a value
        # of the attribute that was seen last
        if current_name and cells[1].value and cells[2].value:
            current_values.append(
                (cells[1].value, normalise(cells[2].value)))
    return collected


def print_attributes(attributes, index=0):
    """Print the collected attributes in a nested structure.

    Starting at position index, each attribute is printed one level deeper
    than the previous one; printing stops after position 3.
    """
    while True:
        name, options = attributes[index]
        indent = ' ' * (index + 2)
        single_x = len(options) == 1 and options[0][0] == 'X'
        if single_x:
            # the only listed value is X: print a bare wildcard line
            print('%sA-Z' % indent)
        else:
            for char, value in sorted(options):
                print('%s%s v="%s"' % (indent, char, value))
            print('%sA-Z a="%s"' % (
                indent, name))
        if index >= 3:
            break
        index += 1


if __name__ == '__main__':
    # Only needed when running as a script, so imported locally
    from urllib.parse import urljoin

    # Download the page that contains the link to the current XLS file
    response = requests.get(download_url, timeout=30)
    response.raise_for_status()
    # Find the download link: the first href that has a /cfi/ path
    # component and ends in .xls or .xlsx
    document = lxml.html.document_fromstring(response.content)
    links = [a.get('href') for a in document.findall('.//a[@href]')]
    link_url = next((a for a in links if re.match(r'.*/cfi/.*xlsx?$', a)), None)
    if link_url is None:
        raise SystemExit('no CFI spreadsheet link found on %s' % download_url)
    # The href may be relative to the page; urljoin() leaves absolute URLs
    # untouched
    link_url = urljoin(response.url, link_url)
    # Download and parse the spreadsheet
    response = requests.get(link_url, timeout=30)
    response.raise_for_status()
    workbook = xlrd.open_workbook(file_contents=response.content)

    print('# generated from %s, downloaded from' % link_url.split('/')[-1])
    print('# %s' % download_url)

    # Group sheets have a six character name that ends in XXXX; the first
    # character is the category and the second the group
    groups = sorted(x for x in workbook.sheet_names() if len(x) == 6 and x.endswith('XXXX'))
    for category, name in sorted(get_categories(workbook.sheet_by_name('Categories'))):
        print('%s category="%s"' % (category, name))
        for group in (x for x in groups if x.startswith(category)):
            sheet = workbook.sheet_by_name(group)
            # the top-left cell of the group sheet holds the group name
            print(' %s group="%s"' % (group[1], normalise(sheet.cell(0, 0).value)))
            print_attributes(get_attributes(sheet))