diff options
author | Arthur de Jong <arthur@arthurdejong.org> | 2007-04-24 20:53:11 +0200 |
---|---|---|
committer | Arthur de Jong <arthur@arthurdejong.org> | 2007-04-24 20:53:11 +0200 |
commit | 44ac994edca503d6b41d04a64fe67d581deb5926 (patch) | |
tree | 00a103ceb902084d90f30ce11b5412b8311a3a5e | |
parent | 3a2e27dc60739c3ab6a08767a8088fccb81df200 (diff) |
handle ID attribute as anchor on any tag
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@326 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- | parsers/html/beautifulsoup.py | 10 |
1 file changed, 5 insertions, 5 deletions
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
index a18d1cc..61abb59 100644
--- a/parsers/html/beautifulsoup.py
+++ b/parsers/html/beautifulsoup.py
@@ -91,13 +91,13 @@ def parse(content, link):
             link.add_anchor(crawler.urlescape(htmlunescape(a['id']).strip()))
         # add the anchor
         link.add_anchor(a_name)
-    # <a id="ID">
-    for a in soup.findAll('a', id=True):
-        # skip entries that have a name
-        if a.has_key('name'):
+    # <ANY id="ID">
+    for elem in soup.findAll(id=True):
+        # skip anchor that have a name
+        if elem.name == 'a' and elem.has_key('name'):
             continue
         # add the anchor
-        link.add_anchor(crawler.urlescape(htmlunescape(a['id']).strip()))
+        link.add_anchor(crawler.urlescape(htmlunescape(elem['id']).strip()))
     # <frameset><frame src="URL"...>...</frameset>
     for frame in soup.findAll('frame', src=True):
         embed = crawler.urlescape(htmlunescape(frame['src']).strip())