Update getimsi script

This updates the script to follow the change to the Wikipedia article (the MCC and MNC data is now read from the single Mobile_country_code page) and, for now, removes the code for getting the data from ITU. See: https://github.com/arthurdejong/python-stdnum/issues/1
author: Arthur de Jong <arthur@arthurdejong.org> 2013-12-04 20:54:12 +0100
committer: Arthur de Jong <arthur@arthurdejong.org> 2013-12-04 20:59:38 +0100
commit: 7f30979a49c2605c9a5825281f73ef57bfa66192 (patch)
tree: 4c895f1d45be2c23697178f6ec95f6cbd63feeb2 /getimsi.py
parent: b0c47d5e2889fe7ead576c9b4e8334e9699e5fda (diff)
1 files changed, 85 insertions, 140 deletions
diff --git a/getimsi.py b/getimsi.py
index 4c6b9d2..4e3b4a4 100755
--- a/getimsi.py
+++ b/getimsi.py
@@ -19,144 +19,102 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 # 02110-1301 USA
 
-import urllib
+from collections import defaultdict
 import re
+import urllib
 
 
 # URLs that are downloaded
-mcc_list_url = 'http://en.wikipedia.org/w/index.php?title=List_of_mobile_country_codes&action=raw'
-mnc_list_url = 'http://en.wikipedia.org/w/index.php?title=Mobile_Network_Code&action=raw'
+mcc_list_url = 'https://en.wikipedia.org/w/index.php?title=Mobile_country_code&action=raw'
 
 
 cleanup_replacements = {
-  'Anguilla (United Kingdom)': 'Anguilla',
-  'Argentina|Argentine Republic': 'Argentina',
-  'Aruba (Kingdom of the Netherlands|Netherlands)': 'Aruba',
-  'Azerbaijan|Azerbaijani Republic': 'Azerbaijan',
-  'Bermuda       (United Kingdom)': 'Bermuda',
-  'British Virgin Islands (United Kingdom)': 'British Virgin Islands',
-  'Brunei|Brunei Darussalam': 'Brunei',
-  'Cayman Islands': 'Cayman Islands (United Kingdom)',
-  'Cayman Islands (United Kingdom)': 'Cayman Islands (United Kingdom)',
-  'Czech Rep.': 'Czech Republic',
-  'Democratic People\'s Republic of Korea|Korea, North': 'North Korea',
-  'Denmark (Kingdom of Denmark)': 'Denmark',
-  'Faroe Islands (Kingdom of Denmark)': 'Faroe Islands (Denmark)',
-  'French Polynesia (France)': 'French Polynesia',
-  'Gabon|Gabonese Republic': 'Gabon',
-  'Georgia (country)|Georgia': 'Georgia',
-  'Gibraltar': 'Gibraltar (United Kingdom)',
-  'Gibraltar (United Kingdom)': 'Gibraltar (United Kingdom)',
-  'Greenland (Kingdom of Denmark)': 'Greenland (Denmark)',
-  'Guadeloupe': 'Guadeloupe (France)',
-  'Hong Kong (People\'s Republic of China|PRC)': 'Hong Kong (China)',
-  'Hong Kong (Special Administrative Region of People\'s Republic of China)': 'Hong Kong (China)',
-  'Korea (Rep. of)': 'South Korea',
-  'Kyrgyz Republic': 'Kyrgyzstan',
-  'Lao People\'s Democratic Republic|Laos': 'Laos',
-  'Macau (People\'s Republic of China)': 'Macau (China)',
-  'Macau (People\'s Republic of China|PRC)': 'Macau (China)',
-  'Martinique': 'Martinique (France)',
-  'Moldova (Republic of)': 'Moldova',
-  'Montenegro (Republic of)': 'Montenegro',
-  'Netherlands (Kingdom of the Netherlands)': 'Netherlands',
-  'Palestinian Authority': 'Palestinian territories',
-  'Palestinian territories|Palestine': 'Palestinian territories',
-  'People\'s Republic of China|China': 'China',
-  'Puerto Rico (United States)': 'Puerto Rico',
-  'Republic of Ireland|Ireland': 'Ireland',
-  'Republic of Korea|Korea, South': 'South Korea',
-  'Russian Federation': 'Russian Federation',
-  'Rwanda|Rwandese Republic': 'Rwanda',
-  'Serbia (Republic of)': 'Serbia',
-  'Somali Democratic Republic|Somalia': 'Somalia',
-  'Syrian Arab Republic': 'Syria',
-  'Syrian Arab Republic|Syria': 'Syria',
-  'Turks and Caicos Islands (United Kingdom)': 'Turks and Caicos Islands',
-  'United States': 'United States of America',
-  'United States Virgin Islands (United States)': 'United States Virgin Islands',
-  'Venezuela (Bolivarian Republic of)': 'Venezuela',
-  'Vietnam|Viet Nam': 'Vietnam',
+    'Anguilla (United Kingdom)': 'Anguilla',
+    'Argentina|Argentine Republic': 'Argentina',
+    'Aruba (Kingdom of the Netherlands|Netherlands)': 'Aruba',
+    'Azerbaijan|Azerbaijani Republic': 'Azerbaijan',
+    'Bermuda       (United Kingdom)': 'Bermuda',
+    'British Virgin Islands (United Kingdom)': 'British Virgin Islands',
+    'Brunei|Brunei Darussalam': 'Brunei',
+    'Cayman Islands': 'Cayman Islands (United Kingdom)',
+    'Cayman Islands (United Kingdom)': 'Cayman Islands (United Kingdom)',
+    'Czech Rep.': 'Czech Republic',
+    'Democratic People\'s Republic of Korea|Korea, North': 'North Korea',
+    'Denmark (Kingdom of Denmark)': 'Denmark',
+    'Faroe Islands (Kingdom of Denmark)': 'Faroe Islands (Denmark)',
+    'French Polynesia (France)': 'French Polynesia',
+    'Gabon|Gabonese Republic': 'Gabon',
+    'Georgia (country)|Georgia': 'Georgia',
+    'Gibraltar': 'Gibraltar (United Kingdom)',
+    'Gibraltar (United Kingdom)': 'Gibraltar (United Kingdom)',
+    'Greenland (Kingdom of Denmark)': 'Greenland (Denmark)',
+    'Guadeloupe': 'Guadeloupe (France)',
+    'Hong Kong (People\'s Republic of China|PRC)': 'Hong Kong (China)',
+    'Hong Kong (Special Administrative Region of People\'s Republic of China)': 'Hong Kong (China)',
+    'Korea (Rep. of)': 'South Korea',
+    'Kyrgyz Republic': 'Kyrgyzstan',
+    'Lao People\'s Democratic Republic|Laos': 'Laos',
+    'Macau (People\'s Republic of China)': 'Macau (China)',
+    'Macau (People\'s Republic of China|PRC)': 'Macau (China)',
+    'Martinique': 'Martinique (France)',
+    'Moldova (Republic of)': 'Moldova',
+    'Montenegro (Republic of)': 'Montenegro',
+    'Netherlands (Kingdom of the Netherlands)': 'Netherlands',
+    'Palestinian Authority': 'Palestinian territories',
+    'Palestinian territories|Palestine': 'Palestinian territories',
+    'People\'s Republic of China|China': 'China',
+    'Puerto Rico (United States)': 'Puerto Rico',
+    'Republic of Ireland|Ireland': 'Ireland',
+    'Republic of Korea|Korea, South': 'South Korea',
+    'Russian Federation': 'Russian Federation',
+    'Rwanda|Rwandese Republic': 'Rwanda',
+    'Serbia (Republic of)': 'Serbia',
+    'Somali Democratic Republic|Somalia': 'Somalia',
+    'Syrian Arab Republic': 'Syria',
+    'Syrian Arab Republic|Syria': 'Syria',
+    'Turks and Caicos Islands (United Kingdom)': 'Turks and Caicos Islands',
+    'United States': 'United States of America',
+    'United States Virgin Islands (United States)': 'United States Virgin Islands',
+    'Venezuela (Bolivarian Republic of)': 'Venezuela',
+    'Vietnam|Viet Nam': 'Vietnam',
 }
 
 
+remove_ref_re = re.compile(r'<ref>.*?</ref>')
+
+
 def cleanup_value(val):
     """Remove unneeded markup from the value."""
     # remove uninteresting things from value
     val = val.replace('[', '').replace(']', '').strip()
+    val = remove_ref_re.sub('', val)
     # replace value
     val = val.replace('United Kingdom|UK', 'United Kingdom')
     val = val.replace('United States|US', 'United States')
-    val = val.replace('New Zealand|NZ', 'New Zealand')
+    val = val.replace('New Zealand|NZ', 'New Zealand').strip()
     return cleanup_replacements.get(val, val)
 
 
-def update_mccs(mccs, mcc, **kwargs):
-    """Merge provided information in kwrags with the already stored
-    information in mccs."""
-    if mcc not in mccs:
-        mccs[mcc] = dict()
-    mccs[mcc].update(dict((k, cleanup_value(v)) for k,v in kwargs.items() if v))
-
-
-def update_mncs(mccs, mcc, mnc, **kwargs):
+def update_mncs(data, mcc, mnc, **kwargs):
     """Merge provided mnc information with the data that is already stored
     in mccs."""
-    if mcc not in mccs:
-        mccs[mcc] = dict()
-    mncs = mccs[mcc]
-    if mnc not in mncs:
-        mncs[mnc] = dict()
-    mncs[mnc].update(dict((k, cleanup_value(v)) for k,v in kwargs.items() if v))
-
-
-def get_mccs_from_wikipedia(mccs):
-    """Returns a dictionary of Mobile Country Codes mapping to a dictionary
-    that holds the cc (country code) and country (country name) keys. This
-    function parses a Wikipedia page."""
-    mcc_line_re = re.compile('^\|\s*(?P<mcc>[0-9]+)\s*\|\|\s*(?P<cc>[^\s]+)\s*\|\|\s*(?P<country>.*)\s*$')
+    data[mcc][mnc].update(dict((k, cleanup_value(v)) for k, v in kwargs.items() if v))
+
+
+def get_mncs_from_wikipedia(data):
+    """Update the collection of Mobile Country Codes from Wikipedia.
+    This parses a Wikipedia page to extract the MCC and MNC, the first
+    part of any IMSI, and stores the results."""
+    mnc_country_re = re.compile(r'^====\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+====$')
+    mnc_line_re = re.compile(r'^\|\s+(?P<mcc>[0-9]+)' +
+                             r'\s+\|\|\s+(?P<mnc>[0-9]+)' +
+                             r'(\s+\|\|\s+(?P<brand>[^|]*)' +
+                             r'(\s+\|\|\s+(?P<operator>[^|]*)' +
+                             r'(\s+\|\|\s+(?P<status>[^|]*)' +
+                             r'(\s+\|\|\s+(?P<bands>[^|]*)' +
+                             r'(\s+\|\|\s+(?P<notes>[^|]*)' +
+                             r')?)?)?)?)?')
     f = urllib.urlopen(mcc_list_url)
-    for line in f.readlines():
-        # search for lines that are part of the table
-        match = mcc_line_re.search(line)
-        if match:
-            update_mccs(mccs, match.group('mcc'), cc=match.group('cc').lower(),
-                        country=match.group('country'))
-
-
-def get_mncs_from_itu(mccs):
-    """This parses a text file that contains the copy-pasted table from the
-    "Mobile Network Codes (MNC) for the international identification plan
-    for public networks and subscriptions" document by the
-    TELECOMMUNICATION STANDARDIZATION BUREAU OF ITU downloaded from
-    http://www.itu.int/itu-t/bulletin/annex.html"""
-    twonumbers_re = re.compile('^\s*(?P<mcc>[0-9]+)\s+(?P<mnc>[0-9]+)\s*$')
-    f = open('imsi.info', 'r')
-    country = operator = ''
-    for line in f.readlines():
-        line = line.strip()
-        if not line:
-            country = operator
-        else:
-            match = twonumbers_re.search(line)
-            if not match:
-                operator = line
-            else:
-                update_mncs(mccs, match.group('mcc'), match.group('mnc'),
-                            country=country, operator=operator)
-
-
-def get_mncs_from_wikipedia(mccs):
-    """Returns a dictionary of Mobile Country Codes mapping to a dictionary
-    that holds the cc (country code) and country (country name) keys. This
-    function parses a Wikipedia page."""
-    mnc_country_re = re.compile('^====\s+(?P<country>.*?)(\s+-\s+(?P<cc>[^\s]{2}))?\s+====$')
-    mnc_line_re = re.compile('^\|\s+(?P<mcc>[0-9]+)\s+\|\|\s+(?P<mnc>[0-9]+)' +
-                             '(\s+\|\|\s+(?P<brand>[^|]*)' +
-                             '(\s+\|\|\s+(?P<operator>[^|]*)' +
-                             '(\s+\|\|\s+(?P<status>[^|]*)' +
-                             '(\s+\|\|\s+(?P<bands>[^|]*)' + '))))')
-    f = urllib.urlopen(mnc_list_url)
     country = cc = ''
     for line in f.readlines():
         line = line.strip()
@@ -166,7 +124,7 @@ def get_mncs_from_wikipedia(mccs):
             cc = (match.group('cc') or '').lower()
         match = mnc_line_re.match(line)
         if match:
-            update_mncs(mccs, match.group('mcc'), match.group('mnc'),
+            update_mncs(data, match.group('mcc'), match.group('mnc'),
                         country=country, cc=cc, brand=match.group('brand'),
                         operator=match.group('operator'),
                         status=match.group('status'),
@@ -175,41 +133,28 @@ def get_mncs_from_wikipedia(mccs):
 
 if __name__ == '__main__':
     # download/parse the information
-    mccs_info = {}
-    get_mccs_from_wikipedia(mccs_info)
-    mccs_mncs_info = {}
-    get_mncs_from_itu(mccs_mncs_info)
-    get_mncs_from_wikipedia(mccs_mncs_info)
+    data = defaultdict(lambda: defaultdict(dict))
+    get_mncs_from_wikipedia(data)
     # print header
     print '# generated from various sources'
     print '# %s' % mcc_list_url
-    print '# %s' % mnc_list_url
-    print '# http://www.itu.int/itu-t/bulletin/annex.html'
     # build an ordered list of mccs
-    mccs = list(set(mccs_info.keys() + mccs_mncs_info.keys()))
-    mccs.sort()
+    mcc_list = list(data.keys())
+    mcc_list.sort()
     # go over mccs
-    for mcc in mccs:
-        mcci = mccs_info.get(mcc, {})
-        cc = mcci.get('cc', '')
-        country = mcci.get('country', None)
-        print '%s%s%s' % ( mcc, ' cc="%s"' % cc if cc else '',
-                           ' country="%s"' % country if country else '')
+    for mcc in mcc_list:
+        print '%s' % mcc
         # build an ordered list of mncs
-        mncs = mccs_mncs_info.get(mcc, {}).keys()
-        mncs.sort()
-        for mnc in mncs:
-            info = mccs_mncs_info[mcc][mnc]
-            if cc and info.get('cc', '') == cc:
-                del info['cc']
-            if country and info.get('country', None) == country:
-                del info['country']
+        mnc_list = data[mcc].keys()
+        mnc_list.sort()
+        for mnc in mnc_list:
+            info = data[mcc][mnc]
             infokeys = info.keys()
             infokeys.sort()
             print ' %s%s' % (mnc, ''.join([' %s="%s"' % (k, info[k]) for k in infokeys]))
         # try to get the length of mnc's
         try:
-            l = len(mncs[0])
+            l = len(mnc_list[0])
             print ' %s-%s' % (l * '0', l * '9')
         except IndexError:
             pass  # ignore
author	Arthur de Jong <arthur@arthurdejong.org>	2013-12-04 20:54:12 +0100
committer	Arthur de Jong <arthur@arthurdejong.org>	2013-12-04 20:59:38 +0100
commit	7f30979a49c2605c9a5825281f73ef57bfa66192 (patch)
tree	4c895f1d45be2c23697178f6ec95f6cbd63feeb2 /getimsi.py
parent	b0c47d5e2889fe7ead576c9b4e8334e9699e5fda (diff)