Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
path: root/parsers/html/beautifulsoup.py
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org> 2008-06-15 23:17:34 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2008-06-15 23:17:34 +0200
commit: 755e6887ffbf62b367c83b4510414e2e9eb28fa5 (patch)
tree: 12797ff3952dce753f18f5997a751957fe03ff62 /parsers/html/beautifulsoup.py
parent: dec87676f410a10ccb53acd357509edbcedb1b76 (diff)
add parsing of script tag and background attributes, based on a patch by Robert M. Jansen <dutch12154@yahoo.com>
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@380 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers/html/beautifulsoup.py')
-rw-r--r-- parsers/html/beautifulsoup.py | 10
1 files changed, 10 insertions, 0 deletions
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
index fb39873..a3536f0 100644
--- a/parsers/html/beautifulsoup.py
+++ b/parsers/html/beautifulsoup.py
@@ -164,5 +164,15 @@ def parse(content, link):
# delegate handling of inline css to css module
import parsers.css
parsers.css.parse(htmlunescape(style.string), link)
+ # <script src="url">
+ for script in soup.findAll('script', src=True):
+ embed = myurllib.normalizeurl(htmlunescape(script['src']).strip())
+ if embed:
+ link.add_embed(urlparse.urljoin(base, embed))
+ # <body|table|td background="url">
+ for t in soup.findAll( ('body', 'table', 'td'), background=True):
+ embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
+ if embed:
+ link.add_embed(urlparse.urljoin(base, embed))
# flag that the link contains a valid page
link.ispage = True