diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2019-03-10 17:23:44 +0100 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2019-03-10 17:23:44 +0100 |
commit | fbbb5503b1ed31b350c16b8c60f7de08c7a2ad5e (patch) | |
tree | 41dc0d25ca5e6917249c69e2bf0d4182c4eddee4 /update/isil.py | |
parent | 61a8a94146ea9bc03fa94af44957b14ad673dc49 (diff) |
Switch update scripts to beautifulsoup4
Diffstat (limited to 'update/isil.py')
-rwxr-xr-x | update/isil.py | 10 |
1 file changed, 7 insertions, 3 deletions
diff --git a/update/isil.py b/update/isil.py
index 3ef51b6..efa4163 100755
--- a/update/isil.py
+++ b/update/isil.py
@@ -2,7 +2,7 @@
 
 # update/isil.py - script to donwload ISIL agencies
 #
-# Copyright (C) 2011-2018 Arthur de Jong
+# Copyright (C) 2011-2019 Arthur de Jong
 #
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
@@ -26,7 +26,11 @@ code prefixes."""
 
 import re
 import urllib
-import BeautifulSoup
+
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    from BeautifulSoup import BeautifulSoup
 
 
 spaces_re = re.compile(r'\s+', re.UNICODE)
@@ -46,7 +50,7 @@ def parse(f):
     print('# %s' % download_url)
     # We hack the HTML to insert missing <TR> elements
     content = f.read().replace('</TR>', '</TR><TR>')
-    soup = BeautifulSoup.BeautifulSoup(content, convertEntities='html')
+    soup = BeautifulSoup(content)
     # find all table rows
     for tr in soup.findAll('tr'):
         # find the rows with four columns of text