diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2019-03-10 17:23:44 +0100 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2019-03-10 17:23:44 +0100 |
commit | fbbb5503b1ed31b350c16b8c60f7de08c7a2ad5e (patch) | |
tree | 41dc0d25ca5e6917249c69e2bf0d4182c4eddee4 /update | |
parent | 61a8a94146ea9bc03fa94af44957b14ad673dc49 (diff) |
Switch update scripts to beautifulsoup4
Diffstat (limited to 'update')
-rwxr-xr-x | update/at_postleitzahl.py | 10 | ||||
-rwxr-xr-x | update/isil.py | 10 | ||||
-rwxr-xr-x | update/my_bp.py | 11 | ||||
-rw-r--r-- | update/requirements.txt | 2 |
4 files changed, 23 insertions, 10 deletions
diff --git a/update/at_postleitzahl.py b/update/at_postleitzahl.py
index 4d2a993..173ee95 100755
--- a/update/at_postleitzahl.py
+++ b/update/at_postleitzahl.py
@@ -3,7 +3,7 @@
 
 # update/at_postleitzahl.py - download list of Austrian postal codes
 #
-# Copyright (C) 2018 Arthur de Jong
+# Copyright (C) 2018-2019 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -29,7 +29,6 @@ import os.path
 import re
 import urllib
 
-import BeautifulSoup
 import xlrd
 
 
@@ -38,6 +37,11 @@ try:
 except ImportError:
     from urlparse import urljoin
 
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
+
 
 # The page that contains a link to the downloadable spreadsheet with current
 # Austrian postal codes
@@ -60,7 +64,7 @@ regions = {
 def find_download_url():
     """Extract the spreadsheet URL from the Austrian Post website."""
     f = urllib.urlopen(base_url)
-    soup = BeautifulSoup.BeautifulSoup(f, convertEntities='html')
+    soup = BeautifulSoup(f)
     url = soup.find(
         'a',
         attrs=dict(
diff --git a/update/isil.py b/update/isil.py
index 3ef51b6..efa4163 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -2,7 +2,7 @@
 
 # update/isil.py - script to donwload ISIL agencies
 #
-# Copyright (C) 2011-2018 Arthur de Jong
+# Copyright (C) 2011-2019 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -26,7 +26,11 @@ code prefixes."""
 import re
 import urllib
 
-import BeautifulSoup
+
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
 
 
 spaces_re = re.compile(r'\s+', re.UNICODE)
@@ -46,7 +50,7 @@ def parse(f):
     print('# %s' % download_url)
     # We hack the HTML to insert missing <TR> elements
     content = f.read().replace('</TR>', '</TR><TR>')
-    soup = BeautifulSoup.BeautifulSoup(content, convertEntities='html')
+    soup = BeautifulSoup(content)
     # find all table rows
     for tr in soup.findAll('tr'):
         # find the rows with four columns of text
diff --git a/update/my_bp.py b/update/my_bp.py
index ad9bc60..7337db3 100755
--- a/update/my_bp.py
+++ b/update/my_bp.py
@@ -2,7 +2,7 @@
 
 # update/my_bp.py - script to download data from Malaysian government site
 #
-# Copyright (C) 2013-2018 Arthur de Jong
+# Copyright (C) 2013-2019 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -25,10 +25,15 @@ birthplace code from the National Registration Department of Malaysia."""
 import re
 from collections import defaultdict
 
-import BeautifulSoup
 import requests
 
 
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
+
+
 # URLs that are downloaded
 state_list_url = 'http://www.jpn.gov.my/informasi/kod-negeri/'
 country_list_url = 'http://www.jpn.gov.my/en/informasi/kod-negara/'
@@ -48,7 +53,7 @@ def clean(s):
 
 def parse(f):
     """Parse the specified file."""
-    soup = BeautifulSoup.BeautifulSoup(f, convertEntities='html')
+    soup = BeautifulSoup(f)
     # find all table rows
     for tr in soup.find('div', {'class': 'box-content'}).findAll('tr'):
         # find the rows with four columns of text
diff --git a/update/requirements.txt b/update/requirements.txt
index 0535937..c74ee0c 100644
--- a/update/requirements.txt
+++ b/update/requirements.txt
@@ -1,3 +1,3 @@
-BeautifulSoup
+beautifulsoup4
 requests
 xlrd