Arthur de Jong

Open Source / Free Software developer

summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Arthur de Jong <arthur@arthurdejong.org> 2007-04-24 20:53:11 +0200
committer: Arthur de Jong <arthur@arthurdejong.org> 2007-04-24 20:53:11 +0200
commit: 44ac994edca503d6b41d04a64fe67d581deb5926 (patch)
tree: 00a103ceb902084d90f30ce11b5412b8311a3a5e
parent: 3a2e27dc60739c3ab6a08767a8088fccb81df200 (diff)
handle ID attribute as anchor on any tag
git-svn-id: http://arthurdejong.org/svn/webcheck/webcheck@326 86f53f14-5ff3-0310-afe5-9b438ce3f40c
-rw-r--r-- parsers/html/beautifulsoup.py 10
1 files changed, 5 insertions, 5 deletions
diff --git a/parsers/html/beautifulsoup.py b/parsers/html/beautifulsoup.py
index a18d1cc..61abb59 100644
--- a/parsers/html/beautifulsoup.py
+++ b/parsers/html/beautifulsoup.py
@@ -91,13 +91,13 @@ def parse(content, link):
link.add_anchor(crawler.urlescape(htmlunescape(a['id']).strip()))
# add the anchor
link.add_anchor(a_name)
- # <a id="ID">
- for a in soup.findAll('a', id=True):
- # skip entries that have a name
- if a.has_key('name'):
+ # <ANY id="ID">
+ for elem in soup.findAll(id=True):
+ # skip anchor that have a name
+ if elem.name == 'a' and elem.has_key('name'):
continue
# add the anchor
- link.add_anchor(crawler.urlescape(htmlunescape(a['id']).strip()))
+ link.add_anchor(crawler.urlescape(htmlunescape(elem['id']).strip()))
# <frameset><frame src="URL"...>...</frameset>
for frame in soup.findAll('frame', src=True):
embed = crawler.urlescape(htmlunescape(frame['src']).strip())