Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/getimsi.py
diff options
context:
space:
mode:
authorArthur de Jong <arthur@arthurdejong.org>2013-12-04 20:54:12 +0100
committerArthur de Jong <arthur@arthurdejong.org>2013-12-04 20:59:38 +0100
commit7f30979a49c2605c9a5825281f73ef57bfa66192 (patch)
tree4c895f1d45be2c23697178f6ec95f6cbd63feeb2 /getimsi.py
parentb0c47d5e2889fe7ead576c9b4e8334e9699e5fda (diff)
Update getimsi script
This updates the script due to the Wikipedia article change and removes the code for getting the data from ITU for now. See: https://github.com/arthurdejong/python-stdnum/issues/1
Diffstat (limited to 'getimsi.py')
-rwxr-xr-xgetimsi.py225
1 files changed, 85 insertions, 140 deletions
diff --git a/getimsi.py b/getimsi.py
index 4c6b9d2..4e3b4a4 100755
--- a/getimsi.py
+++ b/getimsi.py
@@ -19,144 +19,102 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
-import urllib
+from collections import defaultdict
import re
+import urllib
# URLs that are downloaded
-mcc_list_url = 'http://en.wikipedia.org/w/index.php?title=List_of_mobile_country_codes&action=raw'
-mnc_list_url = 'http://en.wikipedia.org/w/index.php?title=Mobile_Network_Code&action=raw'
+mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
cleanup_replacements = {
- 'Anguilla (United Kingdom)': 'Anguilla',
- 'Argentina|Argentine Republic': 'Argentina',
- 'Aruba (Kingdom of the Netherlands|Netherlands)': 'Aruba',
- 'Azerbaijan|Azerbaijani Republic': 'Azerbaijan',
- 'Bermuda (United Kingdom)': 'Bermuda',
- 'British Virgin Islands (United Kingdom)': 'British Virgin Islands',
- 'Brunei|Brunei Darussalam': 'Brunei',
- 'Cayman Islands': 'Cayman Islands (United Kingdom)',
- 'Cayman Islands (United Kingdom)': 'Cayman Islands (United Kingdom)',
- 'Czech Rep.': 'Czech Republic',
- 'Democratic People\'s Republic of Korea|Korea, North': 'North Korea',
- 'Denmark (Kingdom of Denmark)': 'Denmark',
- 'Faroe Islands (Kingdom of Denmark)': 'Faroe Islands (Denmark)',
- 'French Polynesia (France)': 'French Polynesia',
- 'Gabon|Gabonese Republic': 'Gabon',
- 'Georgia (country)|Georgia': 'Georgia',
- 'Gibraltar': 'Gibraltar (United Kingdom)',
- 'Gibraltar (United Kingdom)': 'Gibraltar (United Kingdom)',
- 'Greenland (Kingdom of Denmark)': 'Greenland (Denmark)',
- 'Guadeloupe': 'Guadeloupe (France)',
- 'Hong Kong (People\'s Republic of China|PRC)': 'Hong Kong (China)',
- 'Hong Kong (Special Administrative Region of People\'s Republic of China)': 'Hong Kong (China)',
- 'Korea (Rep. of)': 'South Korea',
- 'Kyrgyz Republic': 'Kyrgyzstan',
- 'Lao People\'s Democratic Republic|Laos': 'Laos',
- 'Macau (People\'s Republic of China)': 'Macau (China)',
- 'Macau (People\'s Republic of China|PRC)': 'Macau (China)',
- 'Martinique': 'Martinique (France)',
- 'Moldova (Republic of)': 'Moldova',
- 'Montenegro (Republic of)': 'Montenegro',
- 'Netherlands (Kingdom of the Netherlands)': 'Netherlands',
- 'Palestinian Authority': 'Palestinian territories',
- 'Palestinian territories|Palestine': 'Palestinian territories',
- 'People\'s Republic of China|China': 'China',
- 'Puerto Rico (United States)': 'Puerto Rico',
- 'Republic of Ireland|Ireland': 'Ireland',
- 'Republic of Korea|Korea, South': 'South Korea',
- 'Russian Federation': 'Russian Federation',
- 'Rwanda|Rwandese Republic': 'Rwanda',
- 'Serbia (Republic of)': 'Serbia',
- 'Somali Democratic Republic|Somalia': 'Somalia',
- 'Syrian Arab Republic': 'Syria',
- 'Syrian Arab Republic|Syria': 'Syria',
- 'Turks and Caicos Islands (United Kingdom)': 'Turks and Caicos Islands',
- 'United States': 'United States of America',
- 'United States Virgin Islands (United States)': 'United States Virgin Islands',
- 'Venezuela (Bolivarian Republic of)': 'Venezuela',
- 'Vietnam|Viet Nam': 'Vietnam',
+ 'Anguilla (United Kingdom)': 'Anguilla',
+ 'Argentina|Argentine Republic': 'Argentina',
+ 'Aruba (Kingdom of the Netherlands|Netherlands)': 'Aruba',
+ 'Azerbaijan|Azerbaijani Republic': 'Azerbaijan',
+ 'Bermuda (United Kingdom)': 'Bermuda',
+ 'British Virgin Islands (United Kingdom)': 'British Virgin Islands',
+ 'Brunei|Brunei Darussalam': 'Brunei',
+ 'Cayman Islands': 'Cayman Islands (United Kingdom)',
+ 'Cayman Islands (United Kingdom)': 'Cayman Islands (United Kingdom)',
+ 'Czech Rep.': 'Czech Republic',
+ 'Democratic People\'s Republic of Korea|Korea, North': 'North Korea',
+ 'Denmark (Kingdom of Denmark)': 'Denmark',
+ 'Faroe Islands (Kingdom of Denmark)': 'Faroe Islands (Denmark)',
+ 'French Polynesia (France)': 'French Polynesia',
+ 'Gabon|Gabonese Republic': 'Gabon',
+ 'Georgia (country)|Georgia': 'Georgia',
+ 'Gibraltar': 'Gibraltar (United Kingdom)',
+ 'Gibraltar (United Kingdom)': 'Gibraltar (United Kingdom)',
+ 'Greenland (Kingdom of Denmark)': 'Greenland (Denmark)',
+ 'Guadeloupe': 'Guadeloupe (France)',
+ 'Hong Kong (People\'s Republic of China|PRC)': 'Hong Kong (China)',
+ 'Hong Kong (Special Administrative Region of People\'s Republic of China)': 'Hong Kong (China)',
+ 'Korea (Rep. of)': 'South Korea',
+ 'Kyrgyz Republic': 'Kyrgyzstan',
+ 'Lao People\'s Democratic Republic|Laos': 'Laos',
+ 'Macau (People\'s Republic of China)': 'Macau (China)',
+ 'Macau (People\'s Republic of China|PRC)': 'Macau (China)',
+ 'Martinique': 'Martinique (France)',
+ 'Moldova (Republic of)': 'Moldova',
+ 'Montenegro (Republic of)': 'Montenegro',
+ 'Netherlands (Kingdom of the Netherlands)': 'Netherlands',
+ 'Palestinian Authority': 'Palestinian territories',
+ 'Palestinian territories|Palestine': 'Palestinian territories',
+ 'People\'s Republic of China|China': 'China',
+ 'Puerto Rico (United States)': 'Puerto Rico',
+ 'Republic of Ireland|Ireland': 'Ireland',
+ 'Republic of Korea|Korea, South': 'South Korea',
+ 'Russian Federation': 'Russian Federation',
+ 'Rwanda|Rwandese Republic': 'Rwanda',
+ 'Serbia (Republic of)': 'Serbia',
+ 'Somali Democratic Republic|Somalia': 'Somalia',
+ 'Syrian Arab Republic': 'Syria',
+ 'Syrian Arab Republic|Syria': 'Syria',
+ 'Turks and Caicos Islands (United Kingdom)': 'Turks and Caicos Islands',
+ 'United States': 'United States of America',
+ 'United States Virgin Islands (United States)': 'United States Virgin Islands',
+ 'Venezuela (Bolivarian Republic of)': 'Venezuela',
+ 'Vietnam|Viet Nam': 'Vietnam',
}
+remove_ref_re = re.compile(r'<ref>.*?</ref>')
+
+
def cleanup_value(val):
"""Remove unneeded markup from the value."""
# remove uninteresting things from value
val = val.replace('[', '').replace(']', '').strip()
+ val = remove_ref_re.sub('', val)
# replace value
val = val.replace('United Kingdom|UK', 'United Kingdom')
val = val.replace('United States|US', 'United States')
- val = val.replace('New Zealand|NZ', 'New Zealand')
+ val = val.replace('New Zealand|NZ', 'New Zealand').strip()
return cleanup_replacements.get(val, val)
-def update_mccs(mccs, mcc, **kwargs):
- """Merge provided information in kwrags with the already stored
- information in mccs."""
- if mcc not in mccs:
- mccs[mcc] = dict()
- mccs[mcc].update(dict((k, cleanup_value(v)) for k,v in kwargs.items() if v))
-
-
-def update_mncs(mccs, mcc, mnc, **kwargs):
+def update_mncs(data, mcc, mnc, **kwargs):
"""Merge provided mnc information with the data that is already stored
in mccs."""
- if mcc not in mccs:
- mccs[mcc] = dict()
- mncs = mccs[mcc]
- if mnc not in mncs:
- mncs[mnc] = dict()
- mncs[mnc].update(dict((k, cleanup_value(v)) for k,v in kwargs.items() if v))
-
-
-def get_mccs_from_wikipedia(mccs):
- """Returns a dictionary of Mobile Country Codes mapping to a dictionary
- that holds the cc (country code) and country (country name) keys. This
- function parses a Wikipedia page."""
- mcc_line_re = re.compile('^\|\s*(?P<mcc>[0-9]+)\s*\|\|\s*(?P<cc>[^\s]+)\s*\|\|\s*(?P<country>.*)\s*$')
+ data[mcc][mnc].update(dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))
+
+
+def get_mncs_from_wikipedia(data):
+ """Update the collection of Mobile Country Codes from Wikipedia.
+ This parses a Wikipedia page to extract the MCC and MNC, the first
+ part of any IMSI, and stores the results."""
+ mnc_country_re = re.compile(r'^====\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+====$')
+ mnc_line_re = re.compile(r'^\|\s+(?P<mcc>[0-9]+)' +
+ r'\s+\|\|\s+(?P<mnc>[0-9]+)' +
+ r'(\s+\|\|\s+(?P<brand>[^|]*)' +
+ r'(\s+\|\|\s+(?P<operator>[^|]*)' +
+ r'(\s+\|\|\s+(?P<status>[^|]*)' +
+ r'(\s+\|\|\s+(?P<bands>[^|]*)' +
+ r'(\s+\|\|\s+(?P<notes>[^|]*)' +
+ r')?)?)?)?)?')
f = urllib.urlopen(mcc_list_url)
- for line in f.readlines():
- # search for lines that are part of the table
- match = mcc_line_re.search(line)
- if match:
- update_mccs(mccs, match.group('mcc'), cc=match.group('cc').lower(),
- country=match.group('country'))
-
-
-def get_mncs_from_itu(mccs):
- """This parses a text file that contains the copy-pasted table from the
- "Mobile Network Codes (MNC) for the international identification plan
- for public networks and subscriptions" document by the
- TELECOMMUNICATION STANDARDIZATION BUREAU OF ITU downloaded from
- http://www.itu.int/itu-t/bulletin/annex.html"""
- twonumbers_re = re.compile('^\s*(?P<mcc>[0-9]+)\s+(?P<mnc>[0-9]+)\s*$')
- f = open('imsi.info', 'r')
- country = operator = ''
- for line in f.readlines():
- line = line.strip()
- if not line:
- country = operator
- else:
- match = twonumbers_re.search(line)
- if not match:
- operator = line
- else:
- update_mncs(mccs, match.group('mcc'), match.group('mnc'),
- country=country, operator=operator)
-
-
-def get_mncs_from_wikipedia(mccs):
- """Returns a dictionary of Mobile Country Codes mapping to a dictionary
- that holds the cc (country code) and country (country name) keys. This
- function parses a Wikipedia page."""
- mnc_country_re = re.compile('^====\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+====$')
- mnc_line_re = re.compile('^\|\s+(?P<mcc>[0-9]+)\s+\|\|\s+(?P<mnc>[0-9]+)' +
- '(\s+\|\|\s+(?P<brand>[^|]*)' +
- '(\s+\|\|\s+(?P<operator>[^|]*)' +
- '(\s+\|\|\s+(?P<status>[^|]*)' +
- '(\s+\|\|\s+(?P<bands>[^|]*)' + '))))')
- f = urllib.urlopen(mnc_list_url)
country = cc = ''
for line in f.readlines():
line = line.strip()
@@ -166,7 +124,7 @@ def get_mncs_from_wikipedia(mccs):
cc = (match.group('cc') or '').lower()
match = mnc_line_re.match(line)
if match:
- update_mncs(mccs, match.group('mcc'), match.group('mnc'),
+ update_mncs(data, match.group('mcc'), match.group('mnc'),
country=country, cc=cc, brand=match.group('brand'),
operator=match.group('operator'),
status=match.group('status'),
@@ -175,41 +133,28 @@ def get_mncs_from_wikipedia(mccs):
if __name__ == '__main__':
# download/parse the information
- mccs_info = {}
- get_mccs_from_wikipedia(mccs_info)
- mccs_mncs_info = {}
- get_mncs_from_itu(mccs_mncs_info)
- get_mncs_from_wikipedia(mccs_mncs_info)
+ data = defaultdict(lambda: defaultdict(dict))
+ get_mncs_from_wikipedia(data)
# print header
print '# generated from various sources'
print '# %s' % mcc_list_url
- print '# %s' % mnc_list_url
- print '# http://www.itu.int/itu-t/bulletin/annex.html'
# build an ordered list of mccs
- mccs = list(set(mccs_info.keys() + mccs_mncs_info.keys()))
- mccs.sort()
+ mcc_list = list(data.keys())
+ mcc_list.sort()
# go over mccs
- for mcc in mccs:
- mcci = mccs_info.get(mcc, {})
- cc = mcci.get('cc', '')
- country = mcci.get('country', None)
- print '%s%s%s' % ( mcc, ' cc="%s"' % cc if cc else '',
- ' country="%s"' % country if country else '')
+ for mcc in mcc_list:
+ print '%s' % mcc
# build an ordered list of mncs
- mncs = mccs_mncs_info.get(mcc, {}).keys()
- mncs.sort()
- for mnc in mncs:
- info = mccs_mncs_info[mcc][mnc]
- if cc and info.get('cc', '') == cc:
- del info['cc']
- if country and info.get('country', None) == country:
- del info['country']
+ mnc_list = data[mcc].keys()
+ mnc_list.sort()
+ for mnc in mnc_list:
+ info = data[mcc][mnc]
infokeys = info.keys()
infokeys.sort()
print ' %s%s' % (mnc, ''.join([' %s="%s"' % (k, info[k]) for k in infokeys]))
# try to get the length of mnc's
try:
- l = len(mncs[0])
+ l = len(mnc_list[0])
print ' %s-%s' % (l * '0', l * '9')
except IndexError:
pass # ignore