Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/stdnum/util.py
blob: 5b0e4e8885d85f64678e028d58b47faf0b290104 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
# util.py - common utility functions
# coding: utf-8
#
# Copyright (C) 2012-2021 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

"""Common utility functions for other stdnum modules.

This module is meant for internal use by stdnum modules and is not
guaranteed to remain stable and as such not part of the public API of
stdnum.
"""

import pkgutil
import pydoc
import re
import sys
import unicodedata
import warnings

from stdnum.exceptions import *


# Regular expression to match doctests in docstrings (everything from the
# first line that starts with ">>> " up to the end of the text)
_strip_doctest_re = re.compile(r'^>>> .*\Z', re.DOTALL | re.MULTILINE)


# Regular expression to match a non-empty string of ASCII digits only; this
# is anchored with \Z because $ would also match just before a trailing
# newline and so accept values like '123\n'
_digits_re = re.compile(r'^[0-9]+\Z')


def _mk_char_map(mapping):
    """Generate (character, value) tuples from a dictionary that has comma
    separated Unicode character names as keys."""
    for names, replacement in mapping.items():
        for unicode_name in names.split(','):
            try:
                character = unicodedata.lookup(unicode_name)
            except KeyError:  # pragma: no cover (does not happen on Python3)
                # names unknown to this Unicode database are skipped
                continue
            yield (character, replacement)


# Build the mapping of Unicode characters to equivalent ASCII characters.
# Every key is a comma separated list of Unicode character names (written
# as adjacent string literals) and every value is the ASCII character that
# replaces all of the named characters; _mk_char_map() turns the names into
# the actual characters.
_char_map = dict(_mk_char_map({
    # dashes, hyphens, minus signs and other horizontal lines
    'HYPHEN-MINUS,ARMENIAN HYPHEN,HEBREW PUNCTUATION MAQAF,HYPHEN,'
    'NON-BREAKING HYPHEN,FIGURE DASH,EN DASH,EM DASH,HORIZONTAL BAR,'
    'SMALL HYPHEN-MINUS,FULLWIDTH HYPHEN-MINUS,MONGOLIAN NIRUGU,OVERLINE,'
    'HYPHEN BULLET,MACRON,MODIFIER LETTER MINUS SIGN,FULLWIDTH MACRON,'
    'OGHAM SPACE MARK,SUPERSCRIPT MINUS,SUBSCRIPT MINUS,MINUS SIGN,'
    'HORIZONTAL LINE EXTENSION,HORIZONTAL SCAN LINE-1,HORIZONTAL SCAN LINE-3,'
    'HORIZONTAL SCAN LINE-7,HORIZONTAL SCAN LINE-9,STRAIGHTNESS':
        '-',
    # asterisks and stars
    'ASTERISK,ARABIC FIVE POINTED STAR,SYRIAC HARKLEAN ASTERISCUS,'
    'FLOWER PUNCTUATION MARK,VAI FULL STOP,SMALL ASTERISK,FULLWIDTH ASTERISK,'
    'ASTERISK OPERATOR,STAR OPERATOR,HEAVY ASTERISK,LOW ASTERISK,'
    'OPEN CENTRE ASTERISK,EIGHT SPOKED ASTERISK,SIXTEEN POINTED ASTERISK,'
    'TEARDROP-SPOKED ASTERISK,OPEN CENTRE TEARDROP-SPOKED ASTERISK,'
    'HEAVY TEARDROP-SPOKED ASTERISK,EIGHT TEARDROP-SPOKED PROPELLER ASTERISK,'
    'HEAVY EIGHT TEARDROP-SPOKED PROPELLER ASTERISK':
        '*',
    # commas
    'COMMA,ARABIC COMMA,SINGLE LOW-9 QUOTATION MARK,IDEOGRAPHIC COMMA,'
    'ARABIC DECIMAL SEPARATOR,ARABIC THOUSANDS SEPARATOR,PRIME,RAISED COMMA,'
    'PRESENTATION FORM FOR VERTICAL COMMA,SMALL COMMA,'
    'SMALL IDEOGRAPHIC COMMA,FULLWIDTH COMMA,CEDILLA':
        ',',
    # full stops and dots
    'FULL STOP,MIDDLE DOT,GREEK ANO TELEIA,ARABIC FULL STOP,'
    'IDEOGRAPHIC FULL STOP,SYRIAC SUPRALINEAR FULL STOP,'
    'SYRIAC SUBLINEAR FULL STOP,SAMARITAN PUNCTUATION NEQUDAA,'
    'TIBETAN MARK INTERSYLLABIC TSHEG,TIBETAN MARK DELIMITER TSHEG BSTAR,'
    'RUNIC SINGLE PUNCTUATION,BULLET,ONE DOT LEADER,HYPHENATION POINT,'
    'WORD SEPARATOR MIDDLE DOT,RAISED DOT,KATAKANA MIDDLE DOT,'
    'SMALL FULL STOP,FULLWIDTH FULL STOP,HALFWIDTH KATAKANA MIDDLE DOT,'
    'AEGEAN WORD SEPARATOR DOT,PHOENICIAN WORD SEPARATOR,'
    'KHAROSHTHI PUNCTUATION DOT,DOT ABOVE,ARABIC SYMBOL DOT ABOVE,'
    'ARABIC SYMBOL DOT BELOW,BULLET OPERATOR,DOT OPERATOR':
        '.',
    # slashes
    'SOLIDUS,SAMARITAN PUNCTUATION ARKAANU,FULLWIDTH SOLIDUS,DIVISION SLASH,'
    'MATHEMATICAL RISING DIAGONAL,BIG SOLIDUS,FRACTION SLASH':
        '/',
    # colons
    'COLON,ETHIOPIC WORDSPACE,RUNIC MULTIPLE PUNCTUATION,MONGOLIAN COLON,'
    'PRESENTATION FORM FOR VERTICAL COLON,FULLWIDTH COLON,'
    'PRESENTATION FORM FOR VERTICAL TWO DOT LEADER,SMALL COLON':
        ':',
    # spaces
    'SPACE,NO-BREAK SPACE,EN QUAD,EM QUAD,EN SPACE,EM SPACE,'
    'THREE-PER-EM SPACE,FOUR-PER-EM SPACE,SIX-PER-EM SPACE,FIGURE SPACE,'
    'PUNCTUATION SPACE,THIN SPACE,HAIR SPACE,NARROW NO-BREAK SPACE,'
    'MEDIUM MATHEMATICAL SPACE,IDEOGRAPHIC SPACE':
        ' ',
    # full-width and mathematical variants of the digits 0 to 9
    'FULLWIDTH DIGIT ZERO,MATHEMATICAL BOLD DIGIT ZERO,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT ZERO,MATHEMATICAL SANS-SERIF DIGIT ZERO,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT ZERO,MATHEMATICAL MONOSPACE DIGIT ZERO':
        '0',
    'FULLWIDTH DIGIT ONE,MATHEMATICAL BOLD DIGIT ONE,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT ONE,MATHEMATICAL SANS-SERIF DIGIT ONE,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT ONE,MATHEMATICAL MONOSPACE DIGIT ONE':
        '1',
    'FULLWIDTH DIGIT TWO,MATHEMATICAL BOLD DIGIT TWO,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT TWO,MATHEMATICAL SANS-SERIF DIGIT TWO,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT TWO,MATHEMATICAL MONOSPACE DIGIT TWO':
        '2',
    'FULLWIDTH DIGIT THREE,MATHEMATICAL BOLD DIGIT THREE,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT THREE,MATHEMATICAL SANS-SERIF DIGIT THREE,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT THREE,MATHEMATICAL MONOSPACE DIGIT THREE':
        '3',
    'FULLWIDTH DIGIT FOUR,MATHEMATICAL BOLD DIGIT FOUR,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT FOUR,MATHEMATICAL SANS-SERIF DIGIT FOUR,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT FOUR,MATHEMATICAL MONOSPACE DIGIT FOUR':
        '4',
    'FULLWIDTH DIGIT FIVE,MATHEMATICAL BOLD DIGIT FIVE,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT FIVE,MATHEMATICAL SANS-SERIF DIGIT FIVE,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT FIVE,MATHEMATICAL MONOSPACE DIGIT FIVE':
        '5',
    'FULLWIDTH DIGIT SIX,MATHEMATICAL BOLD DIGIT SIX,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT SIX,MATHEMATICAL SANS-SERIF DIGIT SIX,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT SIX,MATHEMATICAL MONOSPACE DIGIT SIX':
        '6',
    'FULLWIDTH DIGIT SEVEN,MATHEMATICAL BOLD DIGIT SEVEN,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT SEVEN,MATHEMATICAL SANS-SERIF DIGIT SEVEN,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT SEVEN,MATHEMATICAL MONOSPACE DIGIT SEVEN':
        '7',
    'FULLWIDTH DIGIT EIGHT,MATHEMATICAL BOLD DIGIT EIGHT,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT EIGHT,MATHEMATICAL SANS-SERIF DIGIT EIGHT,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT EIGHT,MATHEMATICAL MONOSPACE DIGIT EIGHT':
        '8',
    'FULLWIDTH DIGIT NINE,MATHEMATICAL BOLD DIGIT NINE,'
    'MATHEMATICAL DOUBLE-STRUCK DIGIT NINE,MATHEMATICAL SANS-SERIF DIGIT NINE,'
    'MATHEMATICAL SANS-SERIF BOLD DIGIT NINE,MATHEMATICAL MONOSPACE DIGIT NINE':
        '9',
    # apostrophes, accents and single quotation marks
    'APOSTROPHE,GRAVE ACCENT,ACUTE ACCENT,MODIFIER LETTER RIGHT HALF RING,'
    'MODIFIER LETTER LEFT HALF RING,MODIFIER LETTER PRIME,'
    'MODIFIER LETTER TURNED COMMA,MODIFIER LETTER APOSTROPHE,'
    'MODIFIER LETTER VERTICAL LINE,COMBINING GRAVE ACCENT,'
    'COMBINING ACUTE ACCENT,COMBINING TURNED COMMA ABOVE,'
    'COMBINING COMMA ABOVE,ARMENIAN APOSTROPHE,'
    'SINGLE HIGH-REVERSED-9 QUOTATION MARK,LEFT SINGLE QUOTATION MARK,'
    'RIGHT SINGLE QUOTATION MARK':
        "'",
}))


def _clean_chars(number):
    """Return the number with every character that has an entry in the
    character map replaced by its ASCII equivalent."""
    translated = []
    for char in number:
        # characters without a mapping are kept as they are
        translated.append(_char_map.get(char, char))
    return ''.join(translated)


def clean(number, deletechars=''):
    """Remove the specified characters from the supplied number.

    Look-alike Unicode characters (various dashes, spaces, full-width
    digits, etc.) are first replaced by their ASCII counterpart, after which
    every character that is found in deletechars is dropped. InvalidFormat
    is raised if the number cannot be joined into a string (for instance
    when None or an integer is passed).

    >>> clean('123-456:78 9', ' -:')
    '123456789'
    >>> clean('1–2—3―4')
    '1-2-3-4'
    """
    try:
        number = ''.join(x for x in number)
    except Exception:  # noqa: B902
        raise InvalidFormat()
    # test the numeric major version: comparing the sys.version string
    # lexicographically would mistake a version like "10.0" for Python 2
    if sys.version_info[0] < 3 and isinstance(number, str):  # pragma: no cover (Python 2 specific code)
        # byte strings are decoded for the character replacement and encoded
        # again afterwards, first as ASCII and otherwise as UTF-8
        try:
            number = _clean_chars(number.decode()).encode()
        except UnicodeError:
            try:
                number = _clean_chars(number.decode('utf-8')).encode('utf-8')
            except UnicodeError:
                # undecodable byte strings are left untouched (best effort)
                pass
    else:  # pragma: no cover (Python 3 specific code)
        number = _clean_chars(number)
    return ''.join(x for x in number if x not in deletechars)


def isdigits(number):
    """Tell whether the supplied string is made up of digits only."""
    # str.isdigit() is deliberately avoided here: it also returns True for
    # all kinds of Unicode digits, which is generally not what we want
    matched = _digits_re.match(number)
    return matched is not None


def to_unicode(text):
    """Return the text as a unicode string, decoding it when needed."""
    unicode_type = type(u'')
    if isinstance(text, unicode_type):
        return text
    # anything else is decoded as UTF-8 with ISO-8859-15 as fallback
    try:
        decoded = text.decode('utf-8')
    except UnicodeDecodeError:
        decoded = text.decode('iso-8859-15')
    return decoded


def get_number_modules(base='stdnum'):
    """Generate the modules below the base package that can validate numbers
    (the modules that have a validate attribute)."""
    __import__(base)
    package = sys.modules[base]
    # deprecation warnings from transitional modules are ignored
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=DeprecationWarning, module=r'stdnum\..*')
        found = pkgutil.walk_packages(package.__path__, package.__name__ + '.')
        for _finder, modname, _ispkg in found:
            __import__(modname)
            candidate = sys.modules[modname]
            if not hasattr(candidate, 'validate'):
                continue
            # skip modules that are registered under another name
            if candidate.__name__ == modname:
                yield candidate


def get_module_name(module):
    """Return the short name of the number (the first line of the module
    docstring without surrounding full stops)."""
    synopsis, _body = pydoc.splitdoc(pydoc.getdoc(module))
    return synopsis.strip('.')


def get_module_description(module):
    """Return the longer description of the number (the module docstring
    after its first line, without the doctests)."""
    _synopsis, body = pydoc.splitdoc(pydoc.getdoc(module))
    # everything from the first doctest line onwards is dropped
    without_doctests = _strip_doctest_re.sub('', body)
    return without_doctests.strip()


def get_cc_module(cc, name):
    """Look up the named module in the package of the specified country.

    None is returned if the country package cannot be imported or does not
    provide the name.
    """
    country = cc.lower()
    # packages for country codes that are Python reserved words carry an
    # underscore suffix
    if country in ('in', 'is', 'if'):
        country = country + '_'
    try:
        package = __import__(
            'stdnum.%s' % country, globals(), locals(), [str(name)])
        return getattr(package, name, None)
    except ImportError:
        return None


# cache of SOAP clients, keyed on (wsdlurl, timeout)
_soap_clients = {}


def get_soap_client(wsdlurl, timeout=30):  # pragma: no cover (not part of normal test suite)
    """Return a SOAP client for performing requests on the WSDL URL.

    Clients are cached per URL and timeout (which is in seconds). The first
    SOAP library that can be imported is used: zeep, suds and pysimplesoap
    are tried in that order.
    """
    # there are no automated tests for this function because the functions
    # that use it are not automatically tested either
    cache_key = (wsdlurl, timeout)
    if cache_key in _soap_clients:
        return _soap_clients[cache_key]
    try:
        # zeep with a caching client is preferred
        from zeep.transports import Transport
        transport = Transport(timeout=timeout)
        from zeep import CachingClient
        soap_client = CachingClient(wsdlurl, transport=transport).service
    except ImportError:
        try:
            # next best is the zeep client without caching
            from zeep import Client
            soap_client = Client(wsdlurl, transport=transport).service
        except ImportError:
            # the remaining libraries need to be passed the proxy config
            try:
                from urllib import getproxies
            except ImportError:
                from urllib.request import getproxies
            try:
                # suds is tried after zeep
                from suds.client import Client
                soap_client = Client(
                    wsdlurl, proxy=getproxies(), timeout=timeout).service
            except ImportError:
                try:
                    # pysimplesoap is the last resort
                    from pysimplesoap.client import SoapClient
                    soap_client = SoapClient(
                        wsdl=wsdlurl, proxy=getproxies(), timeout=timeout)
                except ImportError:
                    raise ImportError(
                        'No SOAP library (such as zeep) found')
    _soap_clients[cache_key] = soap_client
    return soap_client