#!/usr/bin/env python

# getismsi.py - script to download data from Wikipedia to build the database
#
# Copyright (C) 2011 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

import re
import urllib

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    urlopen = urllib.urlopen  # Python 2


# URLs that are downloaded
mcc_list_url = 'http://en.wikipedia.org/w/index.php?title=List_of_mobile_country_codes&action=raw'
mnc_list_url = 'http://en.wikipedia.org/w/index.php?title=Mobile_Network_Code&action=raw'


# mapping of cleaned-up values (brackets removed, see cleanup_value()) to
# the value that should end up in the output
cleanup_replacements = {
    'Anguilla (United Kingdom)': 'Anguilla',
    'Argentina|Argentine Republic': 'Argentina',
    'Aruba (Kingdom of the Netherlands|Netherlands)': 'Aruba',
    'Azerbaijan|Azerbaijani Republic': 'Azerbaijan',
    'Bermuda (United Kingdom)': 'Bermuda',
    'British Virgin Islands (United Kingdom)': 'British Virgin Islands',
    'Brunei|Brunei Darussalam': 'Brunei',
    'Cayman Islands': 'Cayman Islands (United Kingdom)',
    'Cayman Islands (United Kingdom)': 'Cayman Islands (United Kingdom)',
    'Czech Rep.': 'Czech Republic',
    'Democratic People\'s Republic of Korea|Korea, North': 'North Korea',
    'Denmark (Kingdom of Denmark)': 'Denmark',
    'Faroe Islands (Kingdom of Denmark)': 'Faroe Islands (Denmark)',
    'French Polynesia (France)': 'French Polynesia',
    'Gabon|Gabonese Republic': 'Gabon',
    'Georgia (country)|Georgia': 'Georgia',
    'Gibraltar': 'Gibraltar (United Kingdom)',
    'Gibraltar (United Kingdom)': 'Gibraltar (United Kingdom)',
    'Greenland (Kingdom of Denmark)': 'Greenland (Denmark)',
    'Guadeloupe': 'Guadeloupe (France)',
    'Hong Kong (People\'s Republic of China|PRC)': 'Hong Kong (China)',
    'Hong Kong (Special Administrative Region of People\'s Republic of China)': 'Hong Kong (China)',
    'Korea (Rep. of)': 'South Korea',
    'Kyrgyz Republic': 'Kyrgyzstan',
    'Lao People\'s Democratic Republic|Laos': 'Laos',
    'Macau (People\'s Republic of China)': 'Macau (China)',
    'Macau (People\'s Republic of China|PRC)': 'Macau (China)',
    'Martinique': 'Martinique (France)',
    'Moldova (Republic of)': 'Moldova',
    'Montenegro (Republic of)': 'Montenegro',
    'Netherlands (Kingdom of the Netherlands)': 'Netherlands',
    'Palestinian Authority': 'Palestinian territories',
    'Palestinian territories|Palestine': 'Palestinian territories',
    'People\'s Republic of China|China': 'China',
    'Puerto Rico (United States)': 'Puerto Rico',
    'Republic of Ireland|Ireland': 'Ireland',
    'Republic of Korea|Korea, South': 'South Korea',
    'Russian Federation': 'Russian Federation',
    'Rwanda|Rwandese Republic': 'Rwanda',
    'Serbia (Republic of)': 'Serbia',
    'Somali Democratic Republic|Somalia': 'Somalia',
    'Syrian Arab Republic': 'Syria',
    'Syrian Arab Republic|Syria': 'Syria',
    'Turks and Caicos Islands (United Kingdom)': 'Turks and Caicos Islands',
    'United States': 'United States of America',
    'United States Virgin Islands (United States)': 'United States Virgin Islands',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Vietnam|Viet Nam': 'Vietnam',
}


# The patterns below are compiled once at import time. The group names are
# the ones the parsing functions look up with match.group().

# table row of the MCC page: | <mcc> || <cc> || <country>
mcc_line_re = re.compile(
    r'^\|\s*(?P<mcc>[0-9]+)\s*\|\|\s*(?P<cc>[^\s]+)\s*\|\|\s*(?P<country>.*)\s*$')

# line of the ITU text file that consists of exactly two numbers: <mcc> <mnc>
twonumbers_re = re.compile(r'^\s*(?P<mcc>[0-9]+)\s+(?P<mnc>[0-9]+)\s*$')

# section heading of the MNC page: ==== <country> ==== with an optional
# " - <two character code>" suffix after the country
mnc_country_re = re.compile(
    r'^====\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+====$')

# table row of the MNC page; the nested groups are not optional so the row
# only matches when all four columns after the MNC are present
mnc_line_re = re.compile(
    r'^\|\s+(?P<mcc>[0-9]+)\s+\|\|\s+(?P<mnc>[0-9]+)'
    r'(\s+\|\|\s+(?P<brand>[^|]*)'
    r'(\s+\|\|\s+(?P<operator>[^|]*)'
    r'(\s+\|\|\s+(?P<status>[^|]*)'
    r'(\s+\|\|\s+(?P<bands>[^|]*)'
    r'))))')


def cleanup_value(val):
    """Remove unneeded markup from the value.

    Square brackets and surrounding whitespace are stripped, a few
    "name|abbreviation" link forms are shortened and the result is looked up
    in cleanup_replacements. Values without a replacement are returned in
    their stripped form.
    """
    # remove uninteresting things from value
    val = val.replace('[', '').replace(']', '').strip()
    # replace link forms that can also occur inside a longer value
    val = val.replace('United Kingdom|UK', 'United Kingdom')
    val = val.replace('United States|US', 'United States')
    val = val.replace('New Zealand|NZ', 'New Zealand')
    return cleanup_replacements.get(val, val)


def update_mccs(mccs, mcc, **kwargs):
    """Merge provided information in kwargs with the already stored
    information in mccs.

    Keyword arguments with an empty value are ignored, all other values are
    passed through cleanup_value() before being stored under mccs[mcc].
    The mccs dictionary is modified in place; nothing is returned.
    """
    if mcc not in mccs:
        mccs[mcc] = dict()
    mccs[mcc].update(
        dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))


def update_mncs(mccs, mcc, mnc, **kwargs):
    """Merge provided mnc information with the data that is already stored
    in mccs.

    Keyword arguments with an empty value are ignored, all other values are
    passed through cleanup_value() before being stored under mccs[mcc][mnc].
    The mccs dictionary is modified in place; nothing is returned.
    """
    if mcc not in mccs:
        mccs[mcc] = dict()
    mncs = mccs[mcc]
    if mnc not in mncs:
        mncs[mnc] = dict()
    mncs[mnc].update(
        dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))


def _to_text(line):
    """Return line as a native string (bytes are decoded as UTF-8)."""
    # On Python 2 downloaded lines already are str and are left untouched;
    # on Python 3 they are bytes, which the str patterns cannot match.
    if isinstance(line, str):
        return line
    return line.decode('utf-8')


def _download_lines(url):
    """Download url and return its lines as a list of native strings."""
    response = urlopen(url)
    try:
        return [_to_text(line) for line in response.readlines()]
    finally:
        # make sure the connection is released, also on read errors
        response.close()


def get_mccs_from_wikipedia(mccs):
    """Update mccs with the Mobile Country Codes found on Wikipedia.

    Every table row of the page at mcc_list_url adds or updates an entry in
    the mccs dictionary that maps the MCC to a dictionary holding the cc
    (lower-cased country code) and country (country name) keys. The mccs
    dictionary is modified in place; nothing is returned.
    """
    for line in _download_lines(mcc_list_url):
        # search for lines that are part of the table
        match = mcc_line_re.search(line)
        if match:
            update_mccs(mccs, match.group('mcc'),
                        cc=match.group('cc').lower(),
                        country=match.group('country'))


def get_mncs_from_itu(mccs, filename='imsi.info'):
    """Update mccs with the Mobile Network Codes from the ITU text file.

    This parses a text file that contains the copy-pasted table from the
    "Mobile Network Codes (MNC) for the international identification plan
    for public networks and subscriptions" document by the
    TELECOMMUNICATION STANDARDIZATION BUREAU OF ITU downloaded from
    http://www.itu.int/itu-t/bulletin/annex.html

    The file is read from filename (imsi.info in the current directory by
    default). For every line that consists of two numbers (MCC and MNC) the
    country and operator that are current at that point are stored under
    mccs[mcc][mnc]. The mccs dictionary is modified in place; nothing is
    returned.
    """
    country = operator = ''
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # an empty line promotes the last text line to country name
                country = operator
                continue
            match = twonumbers_re.search(line)
            if not match:
                # any other text line is remembered as the operator name
                operator = line
            else:
                update_mncs(mccs, match.group('mcc'), match.group('mnc'),
                            country=country, operator=operator)


def get_mncs_from_wikipedia(mccs):
    """Update mccs with the Mobile Network Codes found on Wikipedia.

    The page at mnc_list_url is parsed line by line. Section headings set
    the current country and cc (lower-cased, empty when the heading has no
    code) and every table row stores country, cc, brand, operator, status
    and bands under mccs[mcc][mnc]. The mccs dictionary is modified in
    place; nothing is returned.
    """
    country = cc = ''
    for line in _download_lines(mnc_list_url):
        line = line.strip()
        match = mnc_country_re.match(line)
        if match:
            country = match.group('country')
            cc = (match.group('cc') or '').lower()
        match = mnc_line_re.match(line)
        if match:
            update_mncs(mccs, match.group('mcc'), match.group('mnc'),
                        country=country, cc=cc,
                        brand=match.group('brand'),
                        operator=match.group('operator'),
                        status=match.group('status'),
                        bands=match.group('bands'))


def main():
    """Download and parse all sources and print the result to stdout."""
    # download/parse the information
    mccs_info = {}
    get_mccs_from_wikipedia(mccs_info)
    mccs_mncs_info = {}
    get_mncs_from_itu(mccs_mncs_info)
    get_mncs_from_wikipedia(mccs_mncs_info)
    # print header
    print('# generated from various sources')
    print('# %s' % mcc_list_url)
    print('# %s' % mnc_list_url)
    print('# http://www.itu.int/itu-t/bulletin/annex.html')
    # go over an ordered list of all mccs known to either source
    for mcc in sorted(set(mccs_info) | set(mccs_mncs_info)):
        mcci = mccs_info.get(mcc, {})
        cc = mcci.get('cc', '')
        country = mcci.get('country', None)
        print('%s%s%s' % (
            mcc,
            ' cc="%s"' % cc if cc else '',
            ' country="%s"' % country if country else ''))
        # build an ordered list of mncs
        mncs = sorted(mccs_mncs_info.get(mcc, {}))
        for mnc in mncs:
            info = mccs_mncs_info[mcc][mnc]
            # leave out values that only repeat what the mcc line says
            if cc and info.get('cc', '') == cc:
                del info['cc']
            if country and info.get('country', None) == country:
                del info['country']
            print(' %s%s' % (
                mnc,
                ''.join(' %s="%s"' % (k, info[k]) for k in sorted(info))))
        # use the first mnc to determine the length of the mncs of this mcc
        if mncs:
            length = len(mncs[0])
            print(' %s-%s' % (length * '0', length * '9'))


if __name__ == '__main__':
    main()