#!/usr/bin/env python
# getismsi.py - script to download data from Wikipedia to build the database
#
# Copyright (C) 2011 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA
from collections import defaultdict
import re
import urllib
# URL that is downloaded: the raw wikitext (action=raw) of the Wikipedia
# "Mobile_country_code" page, which is parsed for MCC/MNC information
mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
# Lookup table applied as the last step of value cleanup: a cleaned-up value
# that is exactly equal to a key is replaced by the corresponding value, any
# other value is kept as it is.
cleanup_replacements = {
    'Anguilla (United Kingdom)': 'Anguilla',
    'Argentina|Argentine Republic': 'Argentina',
    'Aruba (Kingdom of the Netherlands|Netherlands)': 'Aruba',
    'Azerbaijan|Azerbaijani Republic': 'Azerbaijan',
    'Bermuda (United Kingdom)': 'Bermuda',
    'British Virgin Islands (United Kingdom)': 'British Virgin Islands',
    'Brunei|Brunei Darussalam': 'Brunei',
    'Cayman Islands': 'Cayman Islands (United Kingdom)',
    'Cayman Islands (United Kingdom)': 'Cayman Islands (United Kingdom)',
    'Czech Rep.': 'Czech Republic',
    "Democratic People's Republic of Korea|Korea, North": 'North Korea',
    'Denmark (Kingdom of Denmark)': 'Denmark',
    'Faroe Islands (Kingdom of Denmark)': 'Faroe Islands (Denmark)',
    'French Polynesia (France)': 'French Polynesia',
    'Gabon|Gabonese Republic': 'Gabon',
    'Georgia (country)|Georgia': 'Georgia',
    'Gibraltar': 'Gibraltar (United Kingdom)',
    'Gibraltar (United Kingdom)': 'Gibraltar (United Kingdom)',
    'Greenland (Kingdom of Denmark)': 'Greenland (Denmark)',
    'Guadeloupe': 'Guadeloupe (France)',
    "Hong Kong (People's Republic of China|PRC)": 'Hong Kong (China)',
    "Hong Kong (Special Administrative Region of People's Republic of China)": 'Hong Kong (China)',
    'Korea (Rep. of)': 'South Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    "Lao People's Democratic Republic|Laos": 'Laos',
    "Macau (People's Republic of China)": 'Macau (China)',
    "Macau (People's Republic of China|PRC)": 'Macau (China)',
    'Martinique': 'Martinique (France)',
    'Moldova (Republic of)': 'Moldova',
    'Montenegro (Republic of)': 'Montenegro',
    'Netherlands (Kingdom of the Netherlands)': 'Netherlands',
    'Palestinian Authority': 'Palestinian territories',
    'Palestinian territories|Palestine': 'Palestinian territories',
    "People's Republic of China|China": 'China',
    'Puerto Rico (United States)': 'Puerto Rico',
    'Republic of Ireland|Ireland': 'Ireland',
    'Republic of Korea|Korea, South': 'South Korea',
    'Russian Federation': 'Russian Federation',
    'Rwanda|Rwandese Republic': 'Rwanda',
    'Serbia (Republic of)': 'Serbia',
    'Somali Democratic Republic|Somalia': 'Somalia',
    'Syrian Arab Republic': 'Syria',
    'Syrian Arab Republic|Syria': 'Syria',
    'Turks and Caicos Islands (United Kingdom)': 'Turks and Caicos Islands',
    'United States': 'United States of America',
    'United States Virgin Islands (United States)': 'United States Virgin Islands',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Vietnam|Viet Nam': 'Vietnam',
}
# Strips footnote markup: "<ref ...>...</ref>" pairs and self-closing
# "<ref ... />" tags.
# NOTE(review): the previous pattern, '[.*?]', was a character class that
# deleted every literal '.', '*' and '?' from the value (turning e.g.
# "Czech Rep." into "Czech Rep"). Going by the name of this pattern the
# intent is assumed to be the removal of <ref> footnotes - confirm.
remove_ref_re = re.compile(r'<ref\b[^>]*?(?:/>|>.*?</ref>)')

# Strips double-brace "{{...}}" spans (shortest match, within one line).
remove_comment_re = re.compile(r'\{\{.*?\}\}')

# Strips URLs: anything starting with http://, https://, www. or a
# "domain.tld/" prefix, including balanced parentheses inside the URL and
# excluding trailing punctuation.
# The pattern is assembled from plain raw strings plus one u'' literal for
# the non-ASCII quote characters, so that it is valid syntax on Python 2 and
# Python 3 alike (the ur'' prefix only exists on Python 2).
remove_href_re = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+'
    r'[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|'
    r'(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|'
    r'(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>'
    u'?\xab\xbb\u201c\u201d\u2018\u2019]))')
def cleanup_value(val):
    """Return val with wiki markup stripped and known names normalised.

    The module-level patterns are applied first, then square brackets and
    doubled single quotes are dropped, only the text after the last '|' is
    kept and finally the result is looked up in cleanup_replacements (values
    without an entry are returned as they are).
    """
    # strip everything the module-level patterns consider markup
    for pattern in (remove_comment_re, remove_ref_re, remove_href_re):
        val = pattern.sub('', val)
    # drop link brackets and the '' emphasis marker
    for marker in ('[', ']', '\'\''):
        val = val.replace(marker, '')
    # keep only the part after the last pipe
    text = val.strip().split('|')[-1]
    # NOTE(review): at this point text no longer contains a '|', so of the
    # substitutions below only the 'Unknown' one can still have any effect;
    # the others are retained unchanged.
    substitutions = (
        ('Unknown', ''),
        ('United Kingdom|UK', 'United Kingdom'),
        ('United States|US', 'United States'),
        ('New Zealand|NZ', 'New Zealand'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    text = text.strip()
    return cleanup_replacements.get(text, text)
def update_mncs(data, mcc, mnc, **kwargs):
    """Store the given properties in the data[mcc][mnc] record.

    Every keyword argument with a true value is passed through
    cleanup_value() and merged into the record; keyword arguments that are
    empty or None are skipped, so they never overwrite stored information.
    """
    record = data[mcc][mnc]
    cleaned = {}
    for key, value in kwargs.items():
        if not value:
            continue
        cleaned[key] = cleanup_value(value)
    record.update(cleaned)
# Matches a section heading of two to four '=' signs, e.g.
# "==== Name - XX ====": "country" is the heading text and "cc" the optional
# two-character code after the dash.
mnc_country_re = re.compile(
    r'^[=]{2,4}\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+[=]{2,4}$')

# Matches a table row in which every "||" cell separator has already been
# replaced by two backslashes. The first two cells (mcc and mnc) are
# required, up to five further cells are optional.
# NOTE(review): the group names were missing from the pattern (which made it
# impossible to compile); mcc, mnc, brand, operator, status and bands are
# taken from the group() calls below, "notes" for the last and unused cell
# is a guess.
mnc_line_re = re.compile(
    r'^\|\s*(?P<mcc>[0-9]+)'
    r'\s*\\\\\s*(?P<mnc>[0-9]+)'
    r'(\s*\\\\\s*(?P<brand>[^\\]*)'
    r'(\s*\\\\\s*(?P<operator>[^\\]*)'
    r'(\s*\\\\\s*(?P<status>[^\\]*)'
    r'(\s*\\\\\s*(?P<bands>[^\\]*)'
    r'(\s*\\\\\s*(?P<notes>[^\\]*)'
    r')?)?)?)?)?')


def get_mncs_from_wikipedia(data):
    """Update the collection of Mobile Country Codes from Wikipedia.

    This downloads mcc_list_url and parses the page to extract the MCC and
    MNC, the first part of any IMSI. Section headings provide the country
    name and country code for the table rows that follow them. For every
    MNC of every row the country, cc, brand, operator, status and bands
    values are merged into data[mcc][mnc] via update_mncs().

    data is a nested mapping (MCC -> MNC -> dict of properties) that is
    modified in place; nothing is returned.
    """
    # imported here so this works with both the Python 3 and the Python 2
    # module layout
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen
    response = urlopen(mcc_list_url)
    try:
        lines = response.readlines()
    finally:
        # do not leave the connection open
        response.close()
    country = cc = ''
    for line in lines:
        if not isinstance(line, str):
            # Python 3 hands out bytes
            line = line.decode('utf-8')
        line = line.strip()
        # a heading sets the country for the rows that follow
        match = mnc_country_re.match(line)
        if match:
            country = match.group('country')
            cc = (match.group('cc') or '').lower()
        # only lines with cell separators can be table rows
        if '||' not in line:
            continue
        match = mnc_line_re.match(line.replace('||', '\\\\'))
        if not match:
            continue
        for mnc in str2range(match.group('mnc')):
            update_mncs(data, match.group('mcc'), mnc,
                        country=country, cc=cc,
                        brand=match.group('brand'),
                        operator=match.group('operator'),
                        status=match.group('status'),
                        bands=match.group('bands'))
def str2range(x):
    """Expand a comma-separated list of values and ranges into a list.

    A part that contains a dash is taken to be an inclusive numeric range
    "first-last"; every number in it is zero-padded to the length of the
    "last" text. Any other part is copied to the result unchanged.
    """
    expanded = []
    for item in x.split(','):
        if '-' not in item:
            expanded.append(item)
            continue
        first, last = item.split('-')
        width = len(last)
        expanded.extend(
            '%0*d' % (width, number)
            for number in range(int(first), int(last) + 1))
    return expanded
if __name__ == '__main__':
    # download/parse the information
    data = defaultdict(lambda: defaultdict(dict))
    get_mncs_from_wikipedia(data)
    # print header (print is called with one pre-formatted string so that
    # the output is the same on Python 2 and Python 3)
    print('# generated from various sources')
    print('# %s' % mcc_list_url)
    # go over mccs in sorted order
    for mcc in sorted(data):
        print('%s' % mcc)
        # build an ordered list of mncs
        mnc_list = sorted(data[mcc])
        for mnc in mnc_list:
            info = data[mcc][mnc]
            # one line per mnc: the properties as key="value", sorted by key
            print(' %s%s' % (
                mnc,
                ''.join(' %s="%s"' % (k, info[k]) for k in sorted(info))))
        # finish the mcc with a range line (e.g. " 00-99") that is as wide
        # as the first mnc; there is nothing to print without any mncs
        if mnc_list:
            width = len(mnc_list[0])
            print(' %s-%s' % (width * '0', width * '9'))