Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
path: root/parsers
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org> 2008-06-15 23:17:34 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2008-06-15 23:17:34 +0200
commit: 755e6887ffbf62b367c83b4510414e2e9eb28fa5 (patch)
tree: 12797ff3952dce753f18f5997a751957fe03ff62 /parsers
parent: dec87676f410a10ccb53acd357509edbcedb1b76 (diff)
add parsing of script tag and background attributes, based on a patch by Robert M. Jansen <dutch12154@yahoo.com>
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@380 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers')
-rw-r--r-- parsers/html/beautifulsoup.py 10
-rw-r--r-- parsers/html/htmlparser.py 6
2 files changed, 16 insertions, 0 deletions
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
index fb39873..a3536f0 100644
--- a/parsers/html/beautifulsoup.py
+++ b/parsers/html/beautifulsoup.py
@@ -164,5 +164,15 @@ def parse(content, link):
# delegate handling of inline css to css module
import parsers.css
parsers.css.parse(htmlunescape(style.string), link)
+ # <script src="url">
+ for script in soup.findAll('script', src=True):
+ embed = myurllib.normalizeurl(htmlunescape(script['src']).strip())
+ if embed:
+ link.add_embed(urlparse.urljoin(base, embed))
+ # <body|table|td background="url">
+ for t in soup.findAll( ('body', 'table', 'td'), background=True):
+ embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
+ if embed:
+ link.add_embed(urlparse.urljoin(base, embed))
# flag that the link contains a valid page
link.ispage = True
diff --git a/parsers/html/htmlparser.py b/parsers/html/htmlparser.py
index 00ae2e4..6dbd932 100644
--- a/parsers/html/htmlparser.py
+++ b/parsers/html/htmlparser.py
@@ -200,6 +200,12 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
# <style>content</style>
elif tag == 'style':
self.collect = ''
+ # <script src="url">
+ elif tag == 'script' and attrs.has_key('src'):
+ self.embedded.append(self._cleanurl(attrs['src']))
+ # <body|table|td background="url">
+ elif tag in ('body', 'table', 'td') and attrs.has_key('background'):
+ self.embedded.append(self._cleanurl(attrs['background']))
def handle_endtag(self, tag):
"""Handle end tags in html."""