Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/update/my_bp.py
blob: 3cf1227f744ee34ad175446987ee3624c04baf68 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python

# update/my_bp.py - script to download data from Malaysian government site
#
# Copyright (C) 2013-2018 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

"""This script downloads the list of states and countries and their
birthplace code from the National Registration Department of Malaysia."""

import contextlib
import re
import urllib
from collections import defaultdict

import BeautifulSoup


# URLs that are downloaded: the page listing the states and the page
# listing the countries, each together with their birthplace codes
state_list_url = 'http://www.jpn.gov.my/informasi/kod-negeri/'
country_list_url = 'http://www.jpn.gov.my/en/informasi/kod-negara/'


# matches any run of (Unicode) whitespace so it can be collapsed
spaces_re = re.compile(r'\s+', re.UNICODE)


def clean(s):
    """Return a tidied, UTF-8 encoded version of the text in s.

    Every U+0096 character is dropped (a C1 control character; likely a
    mis-decoded Windows-1252 dash in the downloaded pages), each run of
    whitespace is collapsed into a single space and surrounding
    whitespace is removed before the result is encoded as UTF-8.
    """
    without_control = s.replace(u'\u0096', '')
    collapsed = spaces_re.sub(' ', without_control)
    trimmed = collapsed.strip()
    return trimmed.encode('utf-8')


def parse(f):
    """Parse the specified HTML document and yield pairs of cell texts.

    The table rows inside the <div class="box-content"> element are
    inspected. For every row the cleaned text of the first and second
    cell is yielded as a tuple and, if the row has at least four cells,
    the third and fourth cell are yielded as a second tuple. A pair is
    skipped when either of its two cells is empty. The callers in this
    script treat the first item as a name and the second as the
    birthplace code(s).
    """
    soup = BeautifulSoup.BeautifulSoup(f, convertEntities='html')
    content = soup.find('div', {'class': 'box-content'})
    for row in content.findAll('tr'):
        # collect the cleaned text of every cell in this row
        cells = []
        for cell in row.findAll('td'):
            text = ''.join(node.string for node in cell.findAll(text=True))
            cells.append(clean(text))
        # a row holds one or two column pairs, side by side
        for offset in (0, 2):
            pair = cells[offset:offset + 2]
            if len(pair) == 2 and pair[0] and pair[1]:
                yield pair[0], pair[1]


if __name__ == '__main__':
    # maps each birthplace code to a record that holds an optional
    # 'state' name and the set of 'countries' the code is listed for
    results = defaultdict(lambda: defaultdict(set))
    # read the states; one table cell can list several comma-separated
    # birthplace codes for a single state
    # (for offline testing use: f = open('/tmp/states.html', 'r'))
    # closing() ensures the HTTP response is released once it is parsed
    with contextlib.closing(urllib.urlopen(state_list_url)) as f:
        for state, bps in parse(f):
            for bp in bps.split(','):
                bp = bp.strip()
                results[bp]['state'] = state
                results[bp]['countries'].add('Malaysia')
    # read the countries
    # (for offline testing use: f = open('/tmp/countries.html', 'r'))
    with contextlib.closing(urllib.urlopen(country_list_url)) as f:
        for country, bp in parse(f):
            results[bp]['countries'].add(country)
    # print the results
    print('# generated from National Registration Department of Malaysia, downloaded from')
    print('# %s' % state_list_url)
    print('# %s' % country_list_url)
    print('')
    for bp in sorted(results):
        row = results[bp]
        res = bp
        # membership test instead of indexing: looking up a missing
        # key in the defaultdict would create an empty entry
        if 'state' in row:
            res += ' state="%s"' % row['state']
        countries = sorted(row['countries'])
        # a code that belongs to exactly one country gets a country
        # property in addition to the full countries list
        if len(countries) == 1:
            res += ' country="%s"' % countries[0]
        if countries:
            res += ' countries="%s"' % (', '.join(countries))
        print(res)