1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
#!/usr/bin/env python3
# update/cfi.py - script to download CFI code list from the SIX group
#
# Copyright (C) 2022 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
"""This script downloads the list of CFI codes as published by the SIX group."""
import re
import lxml.html
import requests
import xlrd
# the location of the page that links to the current CFI spreadsheet
download_url = 'https://www.six-group.com/en/products-services/financial-information/data-standards.html'
def normalise(value):
    """Clean and minimise attribute names and values.

    Everything from the first opening parenthesis, opening square bracket
    or newline onwards (together with any spaces directly before it) is
    dropped and the remainder is stripped of surrounding whitespace.
    """
    # The flag must be passed by keyword: the fourth positional argument of
    # re.sub() is count, so a positional flag silently limits the number of
    # substitutions instead of changing how the pattern matches. DOTALL lets
    # the single match run to the end of the value, across further lines.
    return re.sub(r' *[(\[\n].*', '', value, flags=re.DOTALL).strip()
def get_categories(sheet):
    """Yield (code, name) tuples for the top-level CFI categories.

    Only rows whose first cell holds a value of length one and whose
    second cell is non-empty are reported.
    """
    for cells in sheet.get_rows():
        code = cells[0].value
        # anything that is not a single character is not a category row
        if len(code) != 1:
            continue
        name = cells[1].value
        if name:
            yield (code, name)
def get_attributes(sheet):
    """Collect the attributes and their values from a group-specific sheet.

    Returns a list of (attribute, values) tuples in sheet order, where
    values is a list of (character, description) tuples for that attribute.
    """
    collected = []
    current = None
    options = None
    for cells in sheet.get_rows():
        # an attribute header has the first and third cell set, second empty
        is_header = cells[0].value and not cells[1].value and cells[2].value
        if is_header:
            current = normalise(cells[2].value)
            options = []
            collected.append((current, options))
            continue
        # value rows only count once a (non-empty) attribute has been seen
        if current and cells[1].value and cells[2].value:
            options.append((cells[1].value, normalise(cells[2].value)))
    return collected
def print_attributes(attributes, index=0):
    """Print the collected attributes in a nested structure.

    Starting at position index, every attribute up to and including
    position 3 is printed one level deeper than the previous one. An
    attribute whose only value is 'X' is written as a bare A-Z line;
    otherwise its values are listed in sorted order followed by an A-Z
    line carrying the attribute name.
    """
    # the starting level is always printed, deeper levels stop at index 3
    final = max(index, 3)
    for level in range(index, final + 1):
        name, options = attributes[level]
        pad = ' ' * (level + 2)
        if len(options) == 1 and options[0][0] == 'X':
            print('%sA-Z' % pad)
            continue
        for code, label in sorted(options):
            print('%s%s v="%s"' % (pad, code, label))
        print('%sA-Z a="%s"' % (pad, name))
if __name__ == '__main__':
    from urllib.parse import urljoin

    # Download the page that contains the link to the current XLS file
    response = requests.get(download_url, timeout=30)
    response.raise_for_status()
    # Find the download link (the first href that points into a /cfi/ path
    # and ends in .xls or .xlsx)
    document = lxml.html.document_fromstring(response.content)
    links = [a.get('href') for a in document.findall('.//a[@href]')]
    link_url = next((a for a in links if re.match(r'.*/cfi/.*xlsx?$', a)), None)
    if link_url is None:
        # fail with a clear message instead of a bare StopIteration
        raise RuntimeError('no CFI spreadsheet link found on %s' % download_url)
    # href values may be relative to the page; absolute URLs pass through
    # urljoin() unchanged
    link_url = urljoin(response.url, link_url)
    # Download and parse the spreadsheet
    # NOTE(review): the pattern above also accepts .xlsx while xlrd 2.x only
    # reads .xls files - confirm which xlrd version this script is run with
    response = requests.get(link_url, timeout=30)
    response.raise_for_status()
    workbook = xlrd.open_workbook(file_contents=response.content)
    print('# generated from %s, downloaded from' % link_url.split('/')[-1])
    print('# %s' % download_url)
    # group sheets are named with the two-letter category/group code
    # followed by XXXX
    groups = sorted(x for x in workbook.sheet_names() if len(x) == 6 and x.endswith('XXXX'))
    for category, name in sorted(get_categories(workbook.sheet_by_name('Categories'))):
        print('%s category="%s"' % (category, name))
        for group in (x for x in groups if x.startswith(category)):
            sheet = workbook.sheet_by_name(group)
            print(' %s group="%s"' % (group[1], normalise(sheet.cell(0, 0).value)))
            print_attributes(get_attributes(sheet))
|