Arthur de Jong

Open Source / Free Software developer

summaryrefslogtreecommitdiffstats
path: root/django/utils/html_parser.py
blob: a69f9149217b9014efa11b8bb441c62126185c45 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import re
import sys

from django.utils import six
from django.utils.six.moves import html_parser as _html_parser

current_version = sys.version_info

use_workaround = current_version < (2, 7, 3)

try:
    HTMLParseError = _html_parser.HTMLParseError
except AttributeError:
    # create a dummy class for Python 3.5+ where it's been removed
    class HTMLParseError(Exception):
        pass

if not use_workaround:
    if six.PY3:
        class HTMLParser(_html_parser.HTMLParser):
            """Explicitly set convert_charrefs to be False.

            This silences a deprecation warning on Python 3.4, but we can't do
            it at call time because Python 2.7 does not have the keyword
            argument.
            """
            def __init__(self, convert_charrefs=False, **kwargs):
                _html_parser.HTMLParser.__init__(self, convert_charrefs=convert_charrefs, **kwargs)
    else:
        HTMLParser = _html_parser.HTMLParser
else:
    tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')

    class HTMLParser(_html_parser.HTMLParser):
        """
        Patched version of stdlib's HTMLParser with patch from:
        http://bugs.python.org/issue670664
        """
        def __init__(self):
            _html_parser.HTMLParser.__init__(self)
            self.cdata_tag = None

        def set_cdata_mode(self, tag):
            try:
                self.interesting = _html_parser.interesting_cdata
            except AttributeError:
                self.interesting = re.compile(r'</\s*%s\s*>' % tag.lower(), re.I)
            self.cdata_tag = tag.lower()

        def clear_cdata_mode(self):
            self.interesting = _html_parser.interesting_normal
            self.cdata_tag = None

        # Internal -- handle starttag, return end or -1 if not terminated
        def parse_starttag(self, i):
            self.__starttag_text = None
            endpos = self.check_for_whole_start_tag(i)
            if endpos < 0:
                return endpos
            rawdata = self.rawdata
            self.__starttag_text = rawdata[i:endpos]

            # Now parse the data between i+1 and j into a tag and attrs
            attrs = []
            match = tagfind.match(rawdata, i + 1)
            assert match, 'unexpected call to parse_starttag()'
            k = match.end()
            self.lasttag = tag = match.group(1).lower()

            while k < endpos:
                m = _html_parser.attrfind.match(rawdata, k)
                if not m:
                    break
                attrname, rest, attrvalue = m.group(1, 2, 3)
                if not rest:
                    attrvalue = None
                elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    attrvalue = attrvalue[1:-1]
                if attrvalue:
                    attrvalue = self.unescape(attrvalue)
                attrs.append((attrname.lower(), attrvalue))
                k = m.end()

            end = rawdata[k:endpos].strip()
            if end not in (">", "/>"):
                lineno, offset = self.getpos()
                if "\n" in self.__starttag_text:
                    lineno = lineno + self.__starttag_text.count("\n")
                    offset = (len(self.__starttag_text)
                             - self.__starttag_text.rfind("\n"))
                else:
                    offset = offset + len(self.__starttag_text)
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            if end.endswith('/>'):
                # XHTML-style empty tag: <span attr="value" />
                self.handle_startendtag(tag, attrs)
            else:
                self.handle_starttag(tag, attrs)
                if tag in self.CDATA_CONTENT_ELEMENTS:
                    self.set_cdata_mode(tag)  # <--------------------------- Changed
            return endpos

        # Internal -- parse endtag, return end or -1 if incomplete
        def parse_endtag(self, i):
            rawdata = self.rawdata
            assert rawdata[i:i + 2] == "</", "unexpected call to parse_endtag"
            match = _html_parser.endendtag.search(rawdata, i + 1)  # >
            if not match:
                return -1
            j = match.end()
            match = _html_parser.endtagfind.match(rawdata, i)  # </ + tag + >
            if not match:
                if self.cdata_tag is not None:  # *** add ***
                    self.handle_data(rawdata[i:j])  # *** add ***
                    return j  # *** add ***
                self.error("bad end tag: %r" % (rawdata[i:j],))
            # --- changed start ---------------------------------------------------
            tag = match.group(1).strip()
            if self.cdata_tag is not None:
                if tag.lower() != self.cdata_tag:
                    self.handle_data(rawdata[i:j])
                    return j
            # --- changed end -----------------------------------------------------
            self.handle_endtag(tag.lower())
            self.clear_cdata_mode()
            return j