diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2008-06-15 23:17:34 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2008-06-15 23:17:34 +0200 |
commit | 755e6887ffbf62b367c83b4510414e2e9eb28fa5 (patch) | |
tree | 12797ff3952dce753f18f5997a751957fe03ff62 /parsers/html/beautifulsoup.py | |
parent | dec87676f410a10ccb53acd357509edbcedb1b76 (diff) |
add parsing of script tag and background attributes, based on a patch by Robert M. Jansen <dutch12154@yahoo.com>
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@380 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers/html/beautifulsoup.py')
-rw-r--r-- | parsers/html/beautifulsoup.py | 10 |
1 files changed, 10 insertions, 0 deletions
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
index fb39873..a3536f0 100644
--- a/parsers/html/beautifulsoup.py
+++ b/parsers/html/beautifulsoup.py
@@ -164,5 +164,15 @@ def parse(content, link):
             # delegate handling of inline css to css module
             import parsers.css
             parsers.css.parse(htmlunescape(style.string), link)
+    # <script src="url"> (external scripts are recorded as embedded resources)
+    for script in soup.findAll('script', src=True):
+        embed = myurllib.normalizeurl(htmlunescape(script['src']).strip())
+        if embed:
+            link.add_embed(urlparse.urljoin(base, embed))
+    # <body|table|td background="url"> (background images are embedded too)
+    for t in soup.findAll( ('body', 'table', 'td'), background=True):
+        embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
+        if embed:
+            link.add_embed(urlparse.urljoin(base, embed))
     # flag that the link contains a valid page
     link.ispage = True