diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2008-06-15 23:17:34 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2008-06-15 23:17:34 +0200 |
commit | 755e6887ffbf62b367c83b4510414e2e9eb28fa5 (patch) | |
tree | 12797ff3952dce753f18f5997a751957fe03ff62 /parsers | |
parent | dec87676f410a10ccb53acd357509edbcedb1b76 (diff) |
add parsing of script tag and background attributes, based on a patch by Robert M. Jansen <dutch12154@yahoo.com>
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@380 86f53f14-5ff3-0310-afe5-9b438ce3f40c
Diffstat (limited to 'parsers')
-rw-r--r-- | parsers/html/beautifulsoup.py | 10 | ||||
-rw-r--r-- | parsers/html/htmlparser.py | 6 |
2 files changed, 16 insertions, 0 deletions
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
index fb39873..a3536f0 100644
--- a/parsers/html/beautifulsoup.py
+++ b/parsers/html/beautifulsoup.py
@@ -164,5 +164,15 @@ def parse(content, link):
             # delegate handling of inline css to css module
             import parsers.css
             parsers.css.parse(htmlunescape(style.string), link)
+    # <script src="url">
+    for script in soup.findAll('script', src=True):
+        embed = myurllib.normalizeurl(htmlunescape(script['src']).strip())
+        if embed:
+            link.add_embed(urlparse.urljoin(base, embed))
+    # <body|table|td background="url">
+    for t in soup.findAll( ('body', 'table', 'td'), background=True):
+        embed = myurllib.normalizeurl(htmlunescape(t['background']).strip())
+        if embed:
+            link.add_embed(urlparse.urljoin(base, embed))
     # flag that the link contains a valid page
     link.ispage = True
diff --git a/parsers/html/htmlparser.py b/parsers/html/htmlparser.py
index 00ae2e4..6dbd932 100644
--- a/parsers/html/htmlparser.py
+++ b/parsers/html/htmlparser.py
@@ -200,6 +200,12 @@ class _MyHTMLParser(HTMLParser.HTMLParser):
         # <style>content</style>
         elif tag == 'style':
             self.collect = ''
+        # <script src="url">
+        elif tag == 'script' and attrs.has_key('src'):
+            self.embedded.append(self._cleanurl(attrs['src']))
+        # <body|table|td background="url">
+        elif tag in ('body', 'table', 'td') and attrs.has_key('background'):
+            self.embedded.append(self._cleanurl(attrs['background']))
 
     def handle_endtag(self, tag):
         """Handle end tags in html."""