• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""A parser for HTML and XHTML."""
2
3# This file is based on sgmllib.py, but the API is slightly different.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).
9
10
11import re
12import _markupbase
13
14from html import unescape
15
16
17__all__ = ['HTMLParser']
18
# Regular expressions used for parsing

# '&' or '<': the two characters at which scanning of ordinary text stops,
# because a reference or a piece of markup may start there.
interesting_normal = re.compile('[&<]')
# '&' followed by a letter or '#': the first two characters of something
# that may turn out to be a reference once more input is available.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the one character that follows the
# reference; goahead() checks whether that character is ';' and, if it is
# not, leaves it in the buffer.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by an ASCII letter, i.e. the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# A processing instruction is closed by the first '>'.
piclose = re.compile('>')
# '--', optional whitespace, then '>'.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Group 1 is the tag name; the tail swallows whitespace and any '/' that is
# not the first half of '/>'.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# Group 1 is the attribute name, group 2 the optional '=value' part and
# group 3 the value itself (still wrapped in its quotes, if it had any).
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# Matches a start tag from '<' up to, but not including, its closing '>' or
# '/>'; check_for_whole_start_tag() looks at the character after the match.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that terminates an end tag.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Group 1 is the tag name of a well-formed end tag.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
59
60
61
62class HTMLParser(_markupbase.ParserBase):
63    """Find tags and other markup and call handler functions.
64
65    Usage:
66        p = HTMLParser()
67        p.feed(data)
68        ...
69        p.close()
70
71    Start tags are handled by calling self.handle_starttag() or
72    self.handle_startendtag(); end tags by self.handle_endtag().  The
73    data between tags is passed from the parser to the derived class
74    by calling self.handle_data() with the data as argument (the data
75    may be split up in arbitrary chunks).  If convert_charrefs is
76    True the character references are converted automatically to the
77    corresponding Unicode character (and self.handle_data() is no
78    longer split in chunks), otherwise they are passed by calling
79    self.handle_entityref() or self.handle_charref() with the string
80    containing respectively the named or numeric reference as the
81    argument.
82    """
83
84    CDATA_CONTENT_ELEMENTS = ("script", "style")
85
86    def __init__(self, *, convert_charrefs=True):
87        """Initialize and reset this instance.
88
89        If convert_charrefs is True (the default), all character references
90        are automatically converted to the corresponding Unicode characters.
91        """
92        self.convert_charrefs = convert_charrefs
93        self.reset()
94
95    def reset(self):
96        """Reset this instance.  Loses all unprocessed data."""
97        self.rawdata = ''
98        self.lasttag = '???'
99        self.interesting = interesting_normal
100        self.cdata_elem = None
101        _markupbase.ParserBase.reset(self)
102
103    def feed(self, data):
104        r"""Feed data to the parser.
105
106        Call this as often as you want, with as little or as much text
107        as you want (may include '\n').
108        """
109        self.rawdata = self.rawdata + data
110        self.goahead(0)
111
112    def close(self):
113        """Handle any buffered data."""
114        self.goahead(1)
115
116    __starttag_text = None
117
118    def get_starttag_text(self):
119        """Return full source of start tag: '<...>'."""
120        return self.__starttag_text
121
122    def set_cdata_mode(self, elem):
123        self.cdata_elem = elem.lower()
124        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
125
126    def clear_cdata_mode(self):
127        self.interesting = interesting_normal
128        self.cdata_elem = None
129
130    # Internal -- handle data as far as reasonable.  May leave state
131    # and data to be processed by a subsequent call.  If 'end' is
132    # true, force handling all data as if followed by EOF marker.
133    def goahead(self, end):
134        rawdata = self.rawdata
135        i = 0
136        n = len(rawdata)
137        while i < n:
138            if self.convert_charrefs and not self.cdata_elem:
139                j = rawdata.find('<', i)
140                if j < 0:
141                    # if we can't find the next <, either we are at the end
142                    # or there's more text incoming.  If the latter is True,
143                    # we can't pass the text to handle_data in case we have
144                    # a charref cut in half at end.  Try to determine if
145                    # this is the case before proceeding by looking for an
146                    # & near the end and see if it's followed by a space or ;.
147                    amppos = rawdata.rfind('&', max(i, n-34))
148                    if (amppos >= 0 and
149                        not re.compile(r'[\s;]').search(rawdata, amppos)):
150                        break  # wait till we get all the text
151                    j = n
152            else:
153                match = self.interesting.search(rawdata, i)  # < or &
154                if match:
155                    j = match.start()
156                else:
157                    if self.cdata_elem:
158                        break
159                    j = n
160            if i < j:
161                if self.convert_charrefs and not self.cdata_elem:
162                    self.handle_data(unescape(rawdata[i:j]))
163                else:
164                    self.handle_data(rawdata[i:j])
165            i = self.updatepos(i, j)
166            if i == n: break
167            startswith = rawdata.startswith
168            if startswith('<', i):
169                if starttagopen.match(rawdata, i): # < + letter
170                    k = self.parse_starttag(i)
171                elif startswith("</", i):
172                    k = self.parse_endtag(i)
173                elif startswith("<!--", i):
174                    k = self.parse_comment(i)
175                elif startswith("<?", i):
176                    k = self.parse_pi(i)
177                elif startswith("<!", i):
178                    k = self.parse_html_declaration(i)
179                elif (i + 1) < n:
180                    self.handle_data("<")
181                    k = i + 1
182                else:
183                    break
184                if k < 0:
185                    if not end:
186                        break
187                    k = rawdata.find('>', i + 1)
188                    if k < 0:
189                        k = rawdata.find('<', i + 1)
190                        if k < 0:
191                            k = i + 1
192                    else:
193                        k += 1
194                    if self.convert_charrefs and not self.cdata_elem:
195                        self.handle_data(unescape(rawdata[i:k]))
196                    else:
197                        self.handle_data(rawdata[i:k])
198                i = self.updatepos(i, k)
199            elif startswith("&#", i):
200                match = charref.match(rawdata, i)
201                if match:
202                    name = match.group()[2:-1]
203                    self.handle_charref(name)
204                    k = match.end()
205                    if not startswith(';', k-1):
206                        k = k - 1
207                    i = self.updatepos(i, k)
208                    continue
209                else:
210                    if ";" in rawdata[i:]:  # bail by consuming &#
211                        self.handle_data(rawdata[i:i+2])
212                        i = self.updatepos(i, i+2)
213                    break
214            elif startswith('&', i):
215                match = entityref.match(rawdata, i)
216                if match:
217                    name = match.group(1)
218                    self.handle_entityref(name)
219                    k = match.end()
220                    if not startswith(';', k-1):
221                        k = k - 1
222                    i = self.updatepos(i, k)
223                    continue
224                match = incomplete.match(rawdata, i)
225                if match:
226                    # match.group() will contain at least 2 chars
227                    if end and match.group() == rawdata[i:]:
228                        k = match.end()
229                        if k <= i:
230                            k = n
231                        i = self.updatepos(i, i + 1)
232                    # incomplete
233                    break
234                elif (i + 1) < n:
235                    # not the end of the buffer, and can't be confused
236                    # with some other construct
237                    self.handle_data("&")
238                    i = self.updatepos(i, i + 1)
239                else:
240                    break
241            else:
242                assert 0, "interesting.search() lied"
243        # end while
244        if end and i < n and not self.cdata_elem:
245            if self.convert_charrefs and not self.cdata_elem:
246                self.handle_data(unescape(rawdata[i:n]))
247            else:
248                self.handle_data(rawdata[i:n])
249            i = self.updatepos(i, n)
250        self.rawdata = rawdata[i:]
251
252    # Internal -- parse html declarations, return length or -1 if not terminated
253    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
254    # See also parse_declaration in _markupbase
255    def parse_html_declaration(self, i):
256        rawdata = self.rawdata
257        assert rawdata[i:i+2] == '<!', ('unexpected call to '
258                                        'parse_html_declaration()')
259        if rawdata[i:i+4] == '<!--':
260            # this case is actually already handled in goahead()
261            return self.parse_comment(i)
262        elif rawdata[i:i+3] == '<![':
263            return self.parse_marked_section(i)
264        elif rawdata[i:i+9].lower() == '<!doctype':
265            # find the closing >
266            gtpos = rawdata.find('>', i+9)
267            if gtpos == -1:
268                return -1
269            self.handle_decl(rawdata[i+2:gtpos])
270            return gtpos+1
271        else:
272            return self.parse_bogus_comment(i)
273
274    # Internal -- parse bogus comment, return length or -1 if not terminated
275    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
276    def parse_bogus_comment(self, i, report=1):
277        rawdata = self.rawdata
278        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
279                                                'parse_comment()')
280        pos = rawdata.find('>', i+2)
281        if pos == -1:
282            return -1
283        if report:
284            self.handle_comment(rawdata[i+2:pos])
285        return pos + 1
286
287    # Internal -- parse processing instr, return end or -1 if not terminated
288    def parse_pi(self, i):
289        rawdata = self.rawdata
290        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
291        match = piclose.search(rawdata, i+2) # >
292        if not match:
293            return -1
294        j = match.start()
295        self.handle_pi(rawdata[i+2: j])
296        j = match.end()
297        return j
298
299    # Internal -- handle starttag, return end or -1 if not terminated
300    def parse_starttag(self, i):
301        self.__starttag_text = None
302        endpos = self.check_for_whole_start_tag(i)
303        if endpos < 0:
304            return endpos
305        rawdata = self.rawdata
306        self.__starttag_text = rawdata[i:endpos]
307
308        # Now parse the data between i+1 and j into a tag and attrs
309        attrs = []
310        match = tagfind_tolerant.match(rawdata, i+1)
311        assert match, 'unexpected call to parse_starttag()'
312        k = match.end()
313        self.lasttag = tag = match.group(1).lower()
314        while k < endpos:
315            m = attrfind_tolerant.match(rawdata, k)
316            if not m:
317                break
318            attrname, rest, attrvalue = m.group(1, 2, 3)
319            if not rest:
320                attrvalue = None
321            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
322                 attrvalue[:1] == '"' == attrvalue[-1:]:
323                attrvalue = attrvalue[1:-1]
324            if attrvalue:
325                attrvalue = unescape(attrvalue)
326            attrs.append((attrname.lower(), attrvalue))
327            k = m.end()
328
329        end = rawdata[k:endpos].strip()
330        if end not in (">", "/>"):
331            lineno, offset = self.getpos()
332            if "\n" in self.__starttag_text:
333                lineno = lineno + self.__starttag_text.count("\n")
334                offset = len(self.__starttag_text) \
335                         - self.__starttag_text.rfind("\n")
336            else:
337                offset = offset + len(self.__starttag_text)
338            self.handle_data(rawdata[i:endpos])
339            return endpos
340        if end.endswith('/>'):
341            # XHTML-style empty tag: <span attr="value" />
342            self.handle_startendtag(tag, attrs)
343        else:
344            self.handle_starttag(tag, attrs)
345            if tag in self.CDATA_CONTENT_ELEMENTS:
346                self.set_cdata_mode(tag)
347        return endpos
348
349    # Internal -- check to see if we have a complete starttag; return end
350    # or -1 if incomplete.
351    def check_for_whole_start_tag(self, i):
352        rawdata = self.rawdata
353        m = locatestarttagend_tolerant.match(rawdata, i)
354        if m:
355            j = m.end()
356            next = rawdata[j:j+1]
357            if next == ">":
358                return j + 1
359            if next == "/":
360                if rawdata.startswith("/>", j):
361                    return j + 2
362                if rawdata.startswith("/", j):
363                    # buffer boundary
364                    return -1
365                # else bogus input
366                if j > i:
367                    return j
368                else:
369                    return i + 1
370            if next == "":
371                # end of input
372                return -1
373            if next in ("abcdefghijklmnopqrstuvwxyz=/"
374                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
375                # end of input in or before attribute value, or we have the
376                # '/' from a '/>' ending
377                return -1
378            if j > i:
379                return j
380            else:
381                return i + 1
382        raise AssertionError("we should not get here!")
383
384    # Internal -- parse endtag, return end or -1 if incomplete
385    def parse_endtag(self, i):
386        rawdata = self.rawdata
387        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
388        match = endendtag.search(rawdata, i+1) # >
389        if not match:
390            return -1
391        gtpos = match.end()
392        match = endtagfind.match(rawdata, i) # </ + tag + >
393        if not match:
394            if self.cdata_elem is not None:
395                self.handle_data(rawdata[i:gtpos])
396                return gtpos
397            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
398            namematch = tagfind_tolerant.match(rawdata, i+2)
399            if not namematch:
400                # w3.org/TR/html5/tokenization.html#end-tag-open-state
401                if rawdata[i:i+3] == '</>':
402                    return i+3
403                else:
404                    return self.parse_bogus_comment(i)
405            tagname = namematch.group(1).lower()
406            # consume and ignore other stuff between the name and the >
407            # Note: this is not 100% correct, since we might have things like
408            # </tag attr=">">, but looking for > after the name should cover
409            # most of the cases and is much simpler
410            gtpos = rawdata.find('>', namematch.end())
411            self.handle_endtag(tagname)
412            return gtpos+1
413
414        elem = match.group(1).lower() # script or style
415        if self.cdata_elem is not None:
416            if elem != self.cdata_elem:
417                self.handle_data(rawdata[i:gtpos])
418                return gtpos
419
420        self.handle_endtag(elem)
421        self.clear_cdata_mode()
422        return gtpos
423
424    # Overridable -- finish processing of start+end tag: <tag.../>
425    def handle_startendtag(self, tag, attrs):
426        self.handle_starttag(tag, attrs)
427        self.handle_endtag(tag)
428
429    # Overridable -- handle start tag
430    def handle_starttag(self, tag, attrs):
431        pass
432
433    # Overridable -- handle end tag
434    def handle_endtag(self, tag):
435        pass
436
437    # Overridable -- handle character reference
438    def handle_charref(self, name):
439        pass
440
441    # Overridable -- handle entity reference
442    def handle_entityref(self, name):
443        pass
444
445    # Overridable -- handle data
446    def handle_data(self, data):
447        pass
448
449    # Overridable -- handle comment
450    def handle_comment(self, data):
451        pass
452
453    # Overridable -- handle declaration
454    def handle_decl(self, decl):
455        pass
456
457    # Overridable -- handle processing instruction
458    def handle_pi(self, data):
459        pass
460
461    def unknown_decl(self, data):
462        pass
463