diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2021-01-24 15:37:20 +0100 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2021-01-24 15:40:47 +0100 |
commit | 407a02f98c8c8dd911e040eb6f8a784a57f40af6 (patch) | |
tree | c5130016c6c7109e3a09d8a80028a15492dfc6df | |
parent | 53f13b4af087250e43f8841c01906e46d5687b44 (diff) |
Switch postal code download to Austrian open-data portal
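This simplifies the process of downloading Austrian postal codes by
downloading a JSON blob from the Austrian open-data portal, instead of
locating and parsing a spreadsheet on the Austrian Post website. See
https://www.data.gv.at/katalog/dataset/f76ed887-00d6-450f-a158-9f8b1cbbeebf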
This filters the list to only include addressable (adressierbar) postal
codes because that matches the contents of the previous list.
Thanks Bernd Schlapsi for providing the pointer.
Closes https://github.com/arthurdejong/python-stdnum/issues/235
-rw-r--r-- | stdnum/at/postleitzahl.dat | 4 | ||||
-rwxr-xr-x | update/at_postleitzahl.py | 75 |
2 files changed, 15 insertions, 64 deletions
diff --git a/stdnum/at/postleitzahl.dat b/stdnum/at/postleitzahl.dat index 0c5a466..1c6bd46 100644 --- a/stdnum/at/postleitzahl.dat +++ b/stdnum/at/postleitzahl.dat @@ -1,5 +1,5 @@ -# generated from PLZ_Verzeichnis-07012021.xls downloaded from -# https://www.post.at/g/c/postlexikon +# generated from https://data.rtr.at/api/v1/tables/plz.json +# version 20232 published 2020-07-06T10:40:00+02:00 1010 location="Wien" region="Wien" 1020 location="Wien" region="Wien" 1030 location="Wien" region="Wien" diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py index c596848..0ea7851 100755 --- a/update/at_postleitzahl.py +++ b/update/at_postleitzahl.py @@ -24,23 +24,12 @@ from __future__ import print_function, unicode_literals -import os -import os.path - -import lxml.html import requests -import xlrd - -try: - from urllib.parse import urljoin -except ImportError: - from urlparse import urljoin +# The URL of postal codes on the Austrian open-data portal in CSV format. +download_url = 'https://data.rtr.at/api/v1/tables/plz.json' -# The page that contains a link to the downloadable spreadsheet with current -# Austrian postal codes -base_url = 'https://www.post.at/g/c/postlexikon' # The list of regions that can be used in the document. 
regions = { @@ -55,57 +44,19 @@ regions = { 'W': 'Wien', } -# The user agent that will be passed in requests -user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)' - - -# Custom headers that will be passed to requests -headers = { - 'User-Agent': user_agent, -} - - -def find_download_url(): - """Extract the spreadsheet URL from the Austrian Post website.""" - response = requests.get(base_url, headers=headers) - response.raise_for_status() - document = lxml.html.document_fromstring(response.content) - url = [ - a.get('href') - for a in document.findall('.//a[@href]') - if 'Werben/PLZ_Verzeichnis' in a.get('href')][0] - return urljoin(base_url, url.split('?')[0]) - - -def get_postal_codes(download_url): - """Download the Austrian postal codes spreadsheet.""" - response = requests.get(download_url, headers=headers) - response.raise_for_status() - workbook = xlrd.open_workbook( - file_contents=response.content, logfile=open(os.devnull, 'w')) - sheet = workbook.sheet_by_index(0) - rows = sheet.get_rows() - # the first row contains the column names - columns = [column.value.lower() for column in next(rows)] - # the other rows contain data - for row in rows: - data = dict(zip( - columns, - [column.value for column in row])) - if data['adressierbar'].lower() == 'ja': - yield ( - data['plz'], - data['ort'], - regions.get(data['bundesland'])) - if __name__ == '__main__': - # download/parse the information - download_url = find_download_url() + response = requests.get(download_url) + response.raise_for_status() + data = response.json() # print header - print('# generated from %s downloaded from' % - os.path.basename(download_url)) - print('# %s' % base_url) + print('# generated from %s' % download_url) + print('# version %s published %s' % ( + data['version']['id'], data['version']['published'])) # build an ordered list of postal codes - for code, location, region in sorted(get_postal_codes(download_url)): + results = [] + 
for row in data['data']: + if row['adressierbar'] == 'Ja': + results.append((str(row['plz']), row['ort'], regions[row['bundesland']])) + for code, location, region in sorted(results): print('%s location="%s" region="%s"' % (code, location, region)) |