update/my_bp.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

#!/usr/bin/env python3

# update/my_bp.py - script to download data from Malaysian government site
#
# Copyright (C) 2013-2022 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

"""This script downloads the list of states and countries and their
birthplace code from the National Registration Department of Malaysia."""

import os
import re
from collections import defaultdict

import lxml.html
import requests


# URLs that are downloaded
state_list_url = 'https://www.jpn.gov.my/en/kod-negeri-eng'
country_list_url = 'https://www.jpn.gov.my/en/kod-negara-eng'


# The user agent that will be passed in requests
user_agent = 'Mozilla/5.0 (compatible; python-stdnum updater; +https://arthurdejong.org/python-stdnum/)'


spaces_re = re.compile(r'\s+', re.UNICODE)


def clean(td):
    """Clean up the element removing unneeded stuff from it."""
    s = lxml.html.tostring(td, method='text', encoding='utf-8').decode('utf-8')
    return spaces_re.sub(' ', s.replace(u'\u0096', '')).strip()


def parse(content):
    """Parse the specified file."""
    document = lxml.html.document_fromstring(content)
    # find all table rows
    for tr in document.findall('.//div[@class="uk-container"]//tr'):
        tds = [clean(td) for td in tr.findall('td')]
        # table has two columns
        if len(tds) >= 2 and tds[0] and tds[1]:
            yield tds[0], [bp.strip() for bp in tds[1].split(',') if re.match(r' *[0-9]{2} *', bp)]
        if len(tds) >= 4 and tds[2] and tds[3]:
            yield tds[2], [bp.strip() for bp in tds[3].split(',') if re.match(r' *[0-9]{2} *', bp)]


if __name__ == '__main__':
    ca_certificate = os.path.join(os.path.dirname(__file__), 'my_bp.crt')
    headers = {
        'User-Agent': user_agent,
    }
    results = defaultdict(lambda: defaultdict(set))
    # read the states
    response = requests.get(state_list_url, headers=headers, verify=ca_certificate, timeout=30)
    response.raise_for_status()
    for state, bps in parse(response.content):
        for bp in bps:
            results[bp]['state'] = state
            results[bp]['countries'].add('Malaysia')
    # read the countries
    response = requests.get(country_list_url, headers=headers, verify=ca_certificate, timeout=30)
    response.raise_for_status()
    for country, bps in parse(response.content):
        for bp in bps:
            results[bp]['countries'].add(country)
    # print the results
    print('# generated from National Registration Department of Malaysia, downloaded from')
    print('# %s' % state_list_url)
    print('# %s' % country_list_url)
    print('')
    for bp in sorted(results.keys()):
        res = bp
        row = results[bp]
        if 'state' in row:
            res += ' state="%s"' % row['state']
        countries = list(row['countries'])
        countries.sort()
        if len(countries) == 1:
            res += ' country="%s"' % countries[0]
        if len(countries) > 0:
            res += ' countries="%s"' % (', '.join(countries))
        print(res)