Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/update/do_whitelists.py
blob: 3f55d076791419db93097e5eb5bf2147008d054c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
# coding: utf-8

# update/do_whitelists.py - script to update do.rnc and do.cedula whitelists
#
# Copyright (C) 2017-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

"""This script downloads a ZIP file from the Dirección General de Impuestos
Internos (DGII) web site with lists of all RNC and Cedula values and outputs
new whitelists for these modules."""

import io
import os.path
import sys
import tempfile
import textwrap
import zipfile

import requests


# Ensure that our local stdnum implementation is used: put the parent of the
# directory holding this script at the front of the module search path so it
# takes precedence over any other stdnum that may be importable
sys.path.insert(0, os.path.normpath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')))

# This import has to come after the sys.path change above, which is why the
# linter and import sorter are told to leave the line alone
from stdnum.do import cedula, rnc  # noqa, isort:skip


# The URL of the zip file (published by the DGII) with all valid numbers
download_url = 'https://www.dgii.gov.do/app/WebApps/Consultas/rnc/DGII_RNC.zip'


def handle_zipfile(f):
    """Parse the ZIP file and return the sets of invalid RNC and Cedula.

    The TMP/DGII_RNC.TXT member of the archive is read as ISO 8859-15 text.
    Of every line only the first |-separated field is used; if it consists
    solely of digits it is checked as an RNC (9 digits or fewer) or as a
    Cedula (anything longer).

    f is handed to zipfile.ZipFile as-is, so it should be a seekable binary
    file object (or a path). A file object supplied by the caller is not
    closed here; only the ZipFile wrapper and the member stream are.

    Returns a tuple (invalidrnc, invalidcedula) of two sets of strings with
    the numbers that are listed in the file but that are rejected by
    rnc.is_valid() and cedula.is_valid() respectively.
    """
    # collections of invalid numbers found
    invalidrnc = set()
    invalidcedula = set()
    # read the information from the ZIP file (the context managers ensure
    # that the archive and the member stream are released again, also if
    # parsing fails halfway)
    with zipfile.ZipFile(f, 'r') as z:
        with io.TextIOWrapper(
                z.open('TMP/DGII_RNC.TXT'), encoding='iso8859-15') as lines:
            for line in lines:
                number = line.split('|', 1)[0].strip()
                # skip headers, blank lines and other non-numeric entries
                if not number.isdigit():
                    continue
                if len(number) <= 9:
                    if not rnc.is_valid(number):
                        invalidrnc.add(number)
                else:
                    if not cedula.is_valid(number):
                        invalidcedula.add(number)
    # return known but invalid numbers
    return invalidrnc, invalidcedula


if __name__ == '__main__':

    # Download and read the ZIP file with valid data
    with tempfile.TemporaryFile() as tmp:
        # Download the zip file to a temporary file
        response = requests.get(download_url, stream=True, timeout=30)
        response.raise_for_status()
        print('%s: %s' % (
            os.path.basename(download_url),
            response.headers.get('last-modified')))
        # Write the body in chunks so that the complete archive does not
        # have to be held in memory (the request is made with stream=True)
        for chunk in response.iter_content(chunk_size=65536):
            tmp.write(chunk)
        # Open the temporary file as a zip file and read contents
        # (we cannot do this streaming because zipfile requires seek)
        invalidrnc, invalidcedula = handle_zipfile(tmp)

    # Output new RNC whitelist if changed
    if not invalidrnc:
        print('NO NEW WHITELISTED RNC')
    else:
        print('NEW RNC WHITELIST:')
        print('\n'.join(textwrap.wrap(
            ' '.join(sorted(rnc.whitelist | invalidrnc)), 77)))

    # Output new Cedula whitelist if changed (this has to look at the
    # Cedula results, not at the RNC results)
    if not invalidcedula:
        print('NO NEW WHITELISTED CEDULA')
    else:
        print('NEW CEDULA WHITELIST:')
        print('\n'.join(textwrap.wrap(
            ' '.join(sorted(cedula.whitelist | invalidcedula)), 77)))