• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import re
12import _markupbase
13
14from html import unescape
15
16
17__all__ = ['HTMLParser']
18
19# Regular expressions used for parsing
20
# Text outside CDATA elements is scanned for the next '&' or '<'.
interesting_normal = re.compile('[&<]')
# An '&' followed by a letter or '#': the start of a reference that may
# still be incomplete.  Always matches exactly two characters.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference: '&', the name (group 1), then one character that is not
# alphanumeric.  That trailing character is part of the match, so callers
# have to check whether it is the ';' terminator.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference: '&#', decimal digits or 'x'/'X' plus hex digits, then
# one character that is not a hex digit (also included in the match).
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter.
starttagopen = re.compile('<[a-zA-Z]')
# A processing instruction is closed by the first '>'.
piclose = re.compile('>')
# '--', optional whitespace, then '>'.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Tag name (group 1) followed by any run of whitespace and of '/' characters
# that are not immediately followed by '>'.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# One attribute: group 1 is the name, group 2 the optional '=' part together
# with its value, group 3 the value alone (single-quoted, double-quoted or
# bare).  The name must be preceded by a quote, whitespace or '/'.
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# Matches '<', the tag name and as many attributes as can be recognised.
# It does not include the closing '>' or '/>'.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that closes an end tag.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# A well-formed end tag: '</', optional whitespace, the name (group 1),
# optional whitespace, '>'.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
59
60
61
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is passed through as raw text until the
    # matching end tag is seen.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    # Whitespace or ';' ends a character reference.  Compiled once here so
    # goahead() does not have to build the pattern inside its loop.
    __charref_terminator = re.compile(r'[\s;]')

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        super().__init__()
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        super().reset()

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Source text of the start tag parsed most recently, or None.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Treat everything up to the end tag of *elem* as raw data.

        The element name is lower-cased and stored in self.cdata_elem, and
        self.interesting becomes a case-insensitive pattern for that
        element's end tag.
        """
        self.cdata_elem = elem.lower()
        # Escape the name so that it is always matched literally.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Leave raw-data mode and scan for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- pass text to handle_data(), converting character
    # references first when that is enabled and we are not in a CDATA element.
    def __emit_data(self, text):
        if self.convert_charrefs and not self.cdata_elem:
            text = unescape(text)
        self.handle_data(text)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        startswith = rawdata.startswith
        while i < n:
            # Step 1: find j, the end of the plain text starting at i.
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n - 34))
                    if (amppos >= 0 and not
                            self.__charref_terminator.search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                elif self.cdata_elem:
                    # still inside <script>/<style>: wait for its end tag
                    break
                else:
                    j = n
            if i < j:
                self.__emit_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break

            # Step 2: dispatch on the markup that starts at i.
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a '<' that does not start any markup is plain data
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated yet.
                    if not end:
                        break
                    # At EOF: report it as data up to and including the next
                    # '>', else up to the next '<', else just this '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.__emit_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the one trailing character
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k - 1):
                        # the trailing character was not ';': leave it
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k - 1):
                        # the trailing character was not ';': leave it
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # Incomplete reference.  Leave i on the '&': with more
                    # input it may still complete, and at EOF the flush below
                    # reports the whole remainder.  Stepping past the '&'
                    # here would silently drop it from the output.
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Raised explicitly (rather than "assert 0") so the guard is
                # kept under -O, where this loop would otherwise not advance.
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            self.__emit_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return end or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            # report the text between '<!' and '>'
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return end or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    # The text between the two-character opener and the next '>' is passed
    # to handle_comment() unless 'report' is false.
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # report everything between '<?' and the closing '>'
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute given without '=': it has no value
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # strip the surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        tail = rawdata[k:endpos].strip()
        if tail not in (">", "/>"):
            # not a well-formed tag ending: report the whole thing as data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if tail.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m is None:
            raise AssertionError("we should not get here!")
        j = m.end()
        following = rawdata[j:j+1]  # character right after the match, or ''
        if following == ">":
            return j + 1
        if following == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # a lone '/': the rest of the tag has not arrived yet
            return -1
        if following == "":
            # end of input
            return -1
        if following in ("abcdefghijklmnopqrstuvwxyz="
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value
            return -1
        # Anything else is bogus input: stop after what was matched, and
        # always make progress.
        if j > i:
            return j
        return i + 1

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside <script>/<style> this is just more raw data
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # an end tag for some other element is raw data here
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Report an empty-element tag as a start tag followed by an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator, e.g. '65'."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the name of a named reference, e.g. 'amp'."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text of a comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the text between '<!' and '>' of a doctype declaration."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and '>'."""
        pass

    # Overridable -- not called from this class; presumably a hook used by
    # the _markupbase base class (confirm there).
    def unknown_decl(self, data):
        pass
457