Arthur de Jong

Open Source / Free Software developer

summary refs log tree commit diff stats
path: root/getcnloc.py
diff options
context:
space:
mode:
Diffstat (limited to 'getcnloc.py')
-rwxr-xr-x getcnloc.py 19
1 file changed, 12 insertions, 7 deletions
diff --git a/getcnloc.py b/getcnloc.py
index 9126be7..c54d68b 100755
--- a/getcnloc.py
+++ b/getcnloc.py
@@ -3,6 +3,7 @@
# getcnloc.py - script to fetch data from the China (PRC) government site
#
# Copyright (C) 2014 Jiangge Zhang
+# Copyright (C) 2015 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
@@ -43,11 +44,13 @@ def make_etree(response, encoding='utf-8'):
return lxml.html.fromstring(response.text)
-def iter_revisions():
- html = make_etree(requests.get(revisions_url))
+def get_revisions(url):
+ """Return the links to versions of the published administrative division
+ codes."""
+ html = make_etree(requests.get(url))
anchors = html.xpath('.//div[@class="center_list"]/ul/li/a')
for anchor in anchors:
- url = urljoin(revisions_url, anchor.attrib['href'])
+ url = urljoin(url, anchor.attrib['href'])
date_text = anchor.findtext('.//span/*[@class="cont_tit02"]')
date = datetime.strptime(date_text, '%Y-%m-%d').date()
yield url, date
@@ -55,8 +58,9 @@ def iter_revisions():
def iter_records(url):
html = make_etree(requests.get(url))
- lines = html.xpath('.//div[@class="xilan_con"]//p/text()')
+ lines = html.xpath('.//div[@class="xilan_con"]//p')
for line in lines:
+ line = ' '.join(line.xpath('.//text()'))
try:
city_code, city_name = line.strip().split()
except ValueError:
@@ -66,8 +70,7 @@ def iter_records(url):
yield city_code.strip(), city_name.strip()
-def group_records():
- url, _ = max(iter_revisions(), key=itemgetter(1)) # latest revision
+def group_records(url):
provinces = {}
prefectures = {}
@@ -96,7 +99,9 @@ def print_data_file(file):
print("# generated from National Bureau of Statistics of the People's",
file=file)
print('# Republic of China, downloaded from %s' % revisions_url, file=file)
- for city_code, city_data in group_records():
+ url, dt = max(get_revisions(revisions_url), key=itemgetter(1))
+ print('# %s (revision %s)' % (url, dt), file=file)
+ for city_code, city_data in group_records(url):
if not all(city_data.values()):
continue
city_pairs = ' '.join(