diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2024-03-17 17:34:26 +0100 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2024-03-17 18:18:04 +0100 |
commit | 26fd25b32a5d4377d2b17e22adc2db0d27503f93 (patch) | |
tree | dbdef70eea57c7474d5a5b5e31647d95d2cdcbe4 | |
parent | 923060485816ba6b897655251e4b6446e4a5dcc0 (diff) |
Switch to using openpyxl for parsing XLSX files
The xlrd has dropped support for parsing XLSX files. We still use xlrd
for update/be_banks.py because they use the classic XLS format and
openpyxl does not support that format.
-rw-r--r-- | setup.cfg | 1 | ||||
-rwxr-xr-x | update/cfi.py | 21 | ||||
-rwxr-xr-x | update/nz_banks.py | 16 | ||||
-rw-r--r-- | update/requirements.txt | 1 |
4 files changed, 19 insertions, 20 deletions
@@ -47,5 +47,6 @@ lines_after_imports = 2 multi_line_output = 4 known_third_party = lxml + openpyxl requests xlrd diff --git a/update/cfi.py b/update/cfi.py index e47e6bc..f5431a8 100755 --- a/update/cfi.py +++ b/update/cfi.py @@ -2,7 +2,7 @@ # update/cfi.py - script to download CFI code list from the SIX group # -# Copyright (C) 2022 Arthur de Jong +# Copyright (C) 2022-2024 Arthur de Jong # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -21,11 +21,12 @@ """This script downloads the list of CFI codes as published by the SIX group.""" +import io import re import lxml.html +import openpyxl import requests -import xlrd # the location of the Statistical Classification file @@ -39,8 +40,8 @@ def normalise(value): def get_categories(sheet): """Get the list of top-level CFI categories.""" - for row in sheet.get_rows(): - if len(row[0].value) == 1 and row[1].value: + for row in sheet.iter_rows(): + if row[0].value and len(row[0].value) == 1 and row[1].value: yield (row[0].value, row[1].value) @@ -49,7 +50,7 @@ def get_attributes(sheet): attribute = None value_list = [] values = None - for row in sheet.get_rows(): + for row in sheet.iter_rows(): if row[0].value and not row[1].value and row[2].value: attribute = normalise(row[2].value) values = [] @@ -84,15 +85,15 @@ if __name__ == '__main__': # Download and parse the spreadsheet response = requests.get(link_url, timeout=30) response.raise_for_status() - workbook = xlrd.open_workbook(file_contents=response.content) + workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True) print('# generated from %s, downloaded from' % link_url.split('/')[-1]) print('# %s' % download_url) - groups = sorted(x for x in workbook.sheet_names() if len(x) == 6 and x.endswith('XXXX')) - for category, name in sorted(get_categories(workbook.sheet_by_name('Categories'))): + groups = sorted(x for x in workbook.sheetnames if len(x) == 6 and x.endswith('XXXX')) + for category, name in sorted(get_categories(workbook['Categories'])): print('%s category="%s"' % (category, name)) for group in (x for x in groups if x.startswith(category)): - sheet = workbook.sheet_by_name(group) - print(' %s group="%s"' % (group[1], normalise(sheet.cell(0, 0).value))) + sheet = workbook[group] + print(' %s group="%s"' % (group[1], normalise(sheet.cell(1, 1).value))) print_attributes(get_attributes(sheet)) diff --git a/update/nz_banks.py b/update/nz_banks.py index 816930f..af1a433 100755 --- a/update/nz_banks.py +++ b/update/nz_banks.py @@ -3,7 +3,7 @@ # update/nz_banks.py - script to download Bank list from Bank Branch Register # -# Copyright (C) 2019-2021 Arthur de Jong +# Copyright (C) 2019-2024 Arthur de Jong # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public @@ -23,16 +23,12 @@ """This script downloads the list of banks with bank codes as used in the New Zealand bank account numbers.""" +import io import re from collections import OrderedDict, defaultdict +import openpyxl import requests -import xlrd - - -# Monkey patch xlrd avoiding bug in combination with Python 3.9 -xlrd.xlsx.ensure_elementtree_imported(False, None) -xlrd.xlsx.Element_has_iter = True # The page that contains a link to the latest XLS version of the codes. @@ -41,7 +37,7 @@ download_url = 'https://www.paymentsnz.co.nz/resources/industry-registers/bank-b def get_values(sheet): """Return rows from the worksheet as a dict per row.""" - rows = sheet.get_rows() + rows = sheet.iter_rows() # the first row has column names columns = [column.value.lower().replace(' ', '_') for column in next(rows)] # go over rows with values @@ -75,8 +71,8 @@ if __name__ == '__main__': response.raise_for_status() content_disposition = response.headers.get('content-disposition', '') filename = re.findall(r'filename=?(.+)"?', content_disposition)[0].strip('"') - workbook = xlrd.open_workbook(file_contents=response.content) - sheet = workbook.sheet_by_index(0) + workbook = openpyxl.load_workbook(io.BytesIO(response.content), read_only=True) + sheet = workbook.worksheets[0] # print header print('# generated from %s downloaded from' % filename) print('# %s' % download_url) diff --git a/update/requirements.txt b/update/requirements.txt index 16e367c..b51c684 100644 --- a/update/requirements.txt +++ b/update/requirements.txt @@ -1,3 +1,4 @@ lxml +openpyxl requests xlrd |