#!/usr/bin/env python

# getcnloc.py - script to fetch data from the China (PRC) government site
#
# Copyright (C) 2014 Jiangge Zhang
# Copyright (C) 2015 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

from __future__ import print_function, unicode_literals

import sys
import codecs
from urlparse import urljoin
from operator import itemgetter
from datetime import datetime

import requests
import lxml.html


# index page that links to the published revisions of the administrative
# division codes; it is fetched by get_revisions() and quoted in the header
# of the generated data file
revisions_url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'


def make_etree(response, encoding='utf-8'):
    """Parse the body of an HTTP response into an lxml HTML element.

    The response text is decoded using the given encoding before parsing.
    If the response does not indicate success, the status code, reason and
    URL are written to stderr and the process exits with status -1.
    """
    if response.ok:
        # force the encoding before touching response.text
        response.encoding = encoding
        return lxml.html.fromstring(response.text)
    message = '%d %s: %s' % (
        response.status_code, response.reason, response.url)
    print(message, file=sys.stderr)
    sys.exit(-1)


def get_revisions(url):
    """Return the links to versions of the published administrative division
    codes.

    The index page at url is fetched and, for every anchor found in the
    revision list, a (revision_url, date) tuple is yielded. The revision URL
    is the anchor's href resolved against url and the date is parsed from
    the anchor's date text (formatted as YYYY-MM-DD).
    """
    html = make_etree(requests.get(url))
    anchors = html.xpath('.//div[@class="center_list"]/ul/li/a')
    for anchor in anchors:
        # always resolve against the index page itself; reusing the name url
        # here would resolve every later relative link against the previous
        # revision's URL instead of the index page
        revision_url = urljoin(url, anchor.attrib['href'])
        date_text = anchor.findtext('.//span/*[@class="cont_tit02"]')
        date = datetime.strptime(date_text, '%Y-%m-%d').date()
        yield revision_url, date


def iter_records(url):
    """Yield (code, name) tuples parsed from the revision page at url.

    Every paragraph in the content section of the page is flattened to
    text and split on whitespace. Paragraphs that yield exactly two fields
    are returned; any other non-blank paragraph is reported on stderr and
    skipped.
    """
    html = make_etree(requests.get(url))
    for paragraph in html.xpath('.//div[@class="xilan_con"]//p'):
        text = ' '.join(paragraph.xpath('.//text()'))
        fields = text.strip().split()
        if len(fields) == 2:
            # fields produced by split() carry no surrounding whitespace
            code, name = fields
            yield code, name
        elif text.strip():
            print('invalid line: %r' % text, file=sys.stderr)


def group_records(url):
    """Yield (code, names) tuples for every record of the revision at url.

    names is a dict with the keys province, prefecture and county. The
    six-character code is read as three two-character parts: province,
    prefecture and county. A record whose prefecture part is 00 names a
    province, one whose county part is 00 names a prefecture and any other
    record names a county. Names of the enclosing levels are looked up from
    records seen earlier, so a value is None when the corresponding record
    has not been encountered (or, for county, when the record itself is a
    province or prefecture).
    """
    provinces = {}
    prefectures = {}

    for city_code, city_name in iter_records(url):
        province_code = city_code[:2]
        # prefecture numbers are only unique within a province, so the
        # lookup key includes the province part; keying on the two
        # prefecture characters alone could return the name of a prefecture
        # from an earlier province
        prefecture_key = city_code[:4]
        prefecture_code = city_code[2:4]
        county_code = city_code[4:6]

        county_name = None

        if prefecture_code == '00':
            provinces[province_code] = city_name
        elif county_code == '00':
            prefectures[prefecture_key] = city_name
        else:
            county_name = city_name

        yield city_code, dict(
            province=provinces.get(province_code),
            prefecture=prefectures.get(prefecture_key),
            county=county_name)


def print_data_file(file):
    """Write the data file for the most recent revision to file.

    A comment header naming the source is written first, followed by one
    line per record that has a province, prefecture and county name. Each
    line holds the code followed by key="value" pairs in key order.
    """
    header = "# generated from National Bureau of Statistics of the People's"
    print(header, file=file)
    print('# Republic of China, downloaded from %s' % revisions_url, file=file)
    # pick the revision with the latest publication date
    latest_url, latest_date = max(
        get_revisions(revisions_url), key=itemgetter(1))
    print('# %s (revision %s)' % (latest_url, latest_date), file=file)
    for code, names in group_records(latest_url):
        # only fully qualified records (all three names known) are output
        if all(names.values()):
            pairs = ['%s="%s"' % (key, names[key]) for key in sorted(names)]
            print('%s %s' % (code, ' '.join(pairs)), file=file)


if __name__ == '__main__':
    # write straight to a terminal, but wrap stdout in a UTF-8 writer when
    # the output is redirected or piped
    if sys.stdout.isatty():
        output = sys.stdout
    else:
        output = codecs.getwriter('utf-8')(sys.stdout)
    print_data_file(output)