• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""A parser for SGML, using the derived class as a static DTD."""
2
3# XXX This only supports those SGML features used by HTML.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
8# and CDATA (character data -- only end tags are special).  RCDATA is
9# not supported at all.
10
11
12from warnings import warnpy3k
13warnpy3k("the sgmllib module has been removed in Python 3.0",
14         stacklevel=2)
15del warnpy3k
16
17import markupbase
18import re
19
20__all__ = ["SGMLParser", "SGMLParseError"]
21
# Regular expressions used for parsing

# Locates the next character that may start markup: '&' or '<'.
interesting = re.compile('[&<]')
# Matches a prefix that could still grow into a reference or a tag once
# more data arrives: '&', '&name', '&#123', '<', '<tag...', '</tag...',
# '<!...' (none of them terminated).
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                           '<([a-zA-Z][^<>]*|'
                              '/([a-zA-Z][^<>]*)?|'
                              '![^<>]*)?')

# '&name' plus exactly one terminating character (group 1: the name).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' plus exactly one terminating character (group 1: the digits).
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the empty-tag shorthand '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# Opening part of the '<tag/data/' shorthand: '<tag/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete '<tag/data/' shorthand (group 1: tag name, group 2: data).
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile('>')
# Next '<' or '>'; used to find where a start or end tag stops.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (single-quoted, double-quoted or unquoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
42
43
class SGMLParseError(RuntimeError):
    """Signals a parse failure; raised by SGMLParser.error()."""
47
48
49# SGML parser base class -- find tags and call handler functions.
50# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
51# The dtd is defined by deriving a class which defines methods
52# with special names to handle tags: start_foo and end_foo to handle
53# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
54# (Tags are converted to lower case for this purpose.)  The data
55# between tags is passed to the parser by calling self.handle_data()
56# with some data as argument (the data may be split up in arbitrary
57# chunks).  Entity references are passed by calling
58# self.handle_entityref() with the entity reference as argument.
59
60class SGMLParser(markupbase.ParserBase):
    # Pattern for references inside attribute values -- derived classes
    # may override.  Group 1 is an entity name, group 2 the digits of a
    # decimal character reference (only one of the two is set), and
    # group 3 is the optional trailing ';'.  parse_starttag() substitutes
    # every match through _convert_ref().
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')
65
66    def __init__(self, verbose=0):
67        """Initialize and reset this instance."""
68        self.verbose = verbose
69        self.reset()
70
71    def reset(self):
72        """Reset this instance. Loses all unprocessed data."""
73        self.__starttag_text = None
74        self.rawdata = ''
75        self.stack = []
76        self.lasttag = '???'
77        self.nomoretags = 0
78        self.literal = 0
79        markupbase.ParserBase.reset(self)
80
81    def setnomoretags(self):
82        """Enter literal mode (CDATA) till EOF.
83
84        Intended for derived classes only.
85        """
86        self.nomoretags = self.literal = 1
87
88    def setliteral(self, *args):
89        """Enter literal mode (CDATA).
90
91        Intended for derived classes only.
92        """
93        self.literal = 1
94
95    def feed(self, data):
96        """Feed some data to the parser.
97
98        Call this as often as you want, with as little or as much text
99        as you want (may include '\n').  (This just saves the text,
100        all the processing is done by goahead().)
101        """
102
103        self.rawdata = self.rawdata + data
104        self.goahead(0)
105
106    def close(self):
107        """Handle the remaining data."""
108        self.goahead(1)
109
110    def error(self, message):
111        raise SGMLParseError(message)
112
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    #
    # Scans self.rawdata from the start, dispatching plain text to
    # handle_data() and markup to the parse_*/handle_* methods.  'i' is
    # the index of the first unprocessed character; whatever lies beyond
    # it when the loop stops is kept in self.rawdata for the next call.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # After setnomoretags() everything is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the text up to the next '&' or '<' as data.
            match = interesting.search(rawdata, i)
            if match: j = match.start()
            else: j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_starttag() returns the index after the tag,
                    # or -1 when the tag is not complete yet.
                    k = self.parse_starttag(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # NOTE(review): parse_comment() is not defined in this
                    # file (presumably inherited from markupbase.ParserBase);
                    # its result is used as the new index -- confirm.
                    k = self.parse_comment(i)
                    if k < 0: break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # Unlike the other parse_* methods, parse_pi() returns
                    # a length, hence the addition below.
                    k = self.parse_pi(i)
                    if k < 0: break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # NOTE(review): parse_declaration() is not defined in
                    # this file (presumably inherited from
                    # markupbase.ParserBase) -- confirm its return value.
                    k = self.parse_declaration(i)
                    if k < 0: break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    # References are not expanded in literal mode.
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallows one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';': i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    # Same as above: only a ';' terminator is consumed.
                    if rawdata[i-1] != ';': i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                # A lone '<' or '&' that cannot start markup: plain data.
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            # The would-be markup is followed by something that rules it
            # out, so pass it through as data.
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            # At EOF nothing can complete the pending text; flush it.
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack
217
    # Extensions for the DOCTYPE scanner:
    # NOTE(review): not referenced anywhere in this file; presumably read
    # by the declaration parser in markupbase.ParserBase -- confirm.
    _decl_otherchars = '='
220
221    # Internal -- parse processing instr, return length or -1 if not terminated
222    def parse_pi(self, i):
223        rawdata = self.rawdata
224        if rawdata[i:i+2] != '<?':
225            self.error('unexpected call to parse_pi()')
226        match = piclose.search(rawdata, i+2)
227        if not match:
228            return -1
229        j = match.start(0)
230        self.handle_pi(rawdata[i+2: j])
231        j = match.end(0)
232        return j-i
233
234    def get_starttag_text(self):
235        return self.__starttag_text
236
237    # Internal -- handle starttag, return length or -1 if not terminated
238    def parse_starttag(self, i):
239        self.__starttag_text = None
240        start_pos = i
241        rawdata = self.rawdata
242        if shorttagopen.match(rawdata, i):
243            # SGML shorthand: <tag/data/ == <tag>data</tag>
244            # XXX Can data contain &... (entity or char refs)?
245            # XXX Can data contain < or > (tag characters)?
246            # XXX Can there be whitespace before the first /?
247            match = shorttag.match(rawdata, i)
248            if not match:
249                return -1
250            tag, data = match.group(1, 2)
251            self.__starttag_text = '<%s/' % tag
252            tag = tag.lower()
253            k = match.end(0)
254            self.finish_shorttag(tag, data)
255            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
256            return k
257        # XXX The following should skip matching quotes (' or ")
258        # As a shortcut way to exit, this isn't so bad, but shouldn't
259        # be used to locate the actual end of the start tag since the
260        # < or > characters may be embedded in an attribute value.
261        match = endbracket.search(rawdata, i+1)
262        if not match:
263            return -1
264        j = match.start(0)
265        # Now parse the data between i+1 and j into a tag and attrs
266        attrs = []
267        if rawdata[i:i+2] == '<>':
268            # SGML shorthand: <> == <last open tag seen>
269            k = j
270            tag = self.lasttag
271        else:
272            match = tagfind.match(rawdata, i+1)
273            if not match:
274                self.error('unexpected call to parse_starttag')
275            k = match.end(0)
276            tag = rawdata[i+1:k].lower()
277            self.lasttag = tag
278        while k < j:
279            match = attrfind.match(rawdata, k)
280            if not match: break
281            attrname, rest, attrvalue = match.group(1, 2, 3)
282            if not rest:
283                attrvalue = attrname
284            else:
285                if (attrvalue[:1] == "'" == attrvalue[-1:] or
286                    attrvalue[:1] == '"' == attrvalue[-1:]):
287                    # strip quotes
288                    attrvalue = attrvalue[1:-1]
289                attrvalue = self.entity_or_charref.sub(
290                    self._convert_ref, attrvalue)
291            attrs.append((attrname.lower(), attrvalue))
292            k = match.end(0)
293        if rawdata[j] == '>':
294            j = j+1
295        self.__starttag_text = rawdata[start_pos:j]
296        self.finish_starttag(tag, attrs)
297        return j
298
299    # Internal -- convert entity or character reference
300    def _convert_ref(self, match):
301        if match.group(2):
302            return self.convert_charref(match.group(2)) or \
303                '&#%s%s' % match.groups()[1:]
304        elif match.group(3):
305            return self.convert_entityref(match.group(1)) or \
306                '&%s;' % match.group(1)
307        else:
308            return '&%s' % match.group(1)
309
310    # Internal -- parse endtag
311    def parse_endtag(self, i):
312        rawdata = self.rawdata
313        match = endbracket.search(rawdata, i+1)
314        if not match:
315            return -1
316        j = match.start(0)
317        tag = rawdata[i+2:j].strip().lower()
318        if rawdata[j] == '>':
319            j = j+1
320        self.finish_endtag(tag)
321        return j
322
323    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
324    def finish_shorttag(self, tag, data):
325        self.finish_starttag(tag, [])
326        self.handle_data(data)
327        self.finish_endtag(tag)
328
329    # Internal -- finish processing of start tag
330    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
331    def finish_starttag(self, tag, attrs):
332        try:
333            method = getattr(self, 'start_' + tag)
334        except AttributeError:
335            try:
336                method = getattr(self, 'do_' + tag)
337            except AttributeError:
338                self.unknown_starttag(tag, attrs)
339                return -1
340            else:
341                self.handle_starttag(tag, method, attrs)
342                return 0
343        else:
344            self.stack.append(tag)
345            self.handle_starttag(tag, method, attrs)
346            return 1
347
348    # Internal -- finish processing of end tag
349    def finish_endtag(self, tag):
350        if not tag:
351            found = len(self.stack) - 1
352            if found < 0:
353                self.unknown_endtag(tag)
354                return
355        else:
356            if tag not in self.stack:
357                try:
358                    method = getattr(self, 'end_' + tag)
359                except AttributeError:
360                    self.unknown_endtag(tag)
361                else:
362                    self.report_unbalanced(tag)
363                return
364            found = len(self.stack)
365            for i in range(found):
366                if self.stack[i] == tag: found = i
367        while len(self.stack) > found:
368            tag = self.stack[-1]
369            try:
370                method = getattr(self, 'end_' + tag)
371            except AttributeError:
372                method = None
373            if method:
374                self.handle_endtag(tag, method)
375            else:
376                self.unknown_endtag(tag)
377            del self.stack[-1]
378
379    # Overridable -- handle start tag
380    def handle_starttag(self, tag, method, attrs):
381        method(attrs)
382
383    # Overridable -- handle end tag
384    def handle_endtag(self, tag, method):
385        method()
386
387    # Example -- report an unbalanced </...> tag.
388    def report_unbalanced(self, tag):
389        if self.verbose:
390            print '*** Unbalanced </' + tag + '>'
391            print '*** Stack:', self.stack
392
393    def convert_charref(self, name):
394        """Convert character reference, may be overridden."""
395        try:
396            n = int(name)
397        except ValueError:
398            return
399        if not 0 <= n <= 127:
400            return
401        return self.convert_codepoint(n)
402
403    def convert_codepoint(self, codepoint):
404        return chr(codepoint)
405
406    def handle_charref(self, name):
407        """Handle character reference, no need to override."""
408        replacement = self.convert_charref(name)
409        if replacement is None:
410            self.unknown_charref(name)
411        else:
412            self.handle_data(replacement)
413
    # Definition of entities -- derived classes may override
    # Maps an entity name (without the leading '&' and trailing ';') to
    # its replacement text; consulted by convert_entityref().
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
417
418    def convert_entityref(self, name):
419        """Convert entity references.
420
421        As an alternative to overriding this method; one can tailor the
422        results by setting up the self.entitydefs mapping appropriately.
423        """
424        table = self.entitydefs
425        if name in table:
426            return table[name]
427        else:
428            return
429
430    def handle_entityref(self, name):
431        """Handle entity references, no need to override."""
432        replacement = self.convert_entityref(name)
433        if replacement is None:
434            self.unknown_entityref(name)
435        else:
436            self.handle_data(replacement)
437
438    # Example -- handle data, should be overridden
439    def handle_data(self, data):
440        pass
441
442    # Example -- handle comment, could be overridden
443    def handle_comment(self, data):
444        pass
445
446    # Example -- handle declaration, could be overridden
447    def handle_decl(self, decl):
448        pass
449
450    # Example -- handle processing instruction, could be overridden
451    def handle_pi(self, data):
452        pass
453
454    # To be overridden -- handlers for unknown objects
455    def unknown_starttag(self, tag, attrs): pass
456    def unknown_endtag(self, tag): pass
457    def unknown_charref(self, ref): pass
458    def unknown_entityref(self, ref): pass
459
460
class TestSGMLParser(SGMLParser):
    """Diagnostic parser that prints every parse event to stdout.

    Character data is accumulated in self.testdata and printed in
    chunks; every other event first flushes the pending data so the
    output keeps document order.  All output is produced with
    single-string print calls, which behave identically whether print
    is a statement or a function.
    """

    def __init__(self, verbose=0):
        # Set up the buffer before the base class initialises itself.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer 'data'; flush once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: %s' % repr(data))

    def handle_comment(self, data):
        """Print the comment text, eliding the middle of long comments."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: %s' % r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag followed by its name="value" pairs."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One space between all pieces, including before the '>'.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a note about an entity reference that was not converted."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a note about a character reference that was not converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print a note about an unrecognised declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
514
515
def test(args=None):
    """Parse a file with a demo parser, feeding it one character at a time.

    'args' defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the
    file to parse ('test.html' if absent); '-' means standard input.
    If the file cannot be opened, the error is printed and the process
    exits with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print("%s : %s" % (filename, msg))
            sys.exit(1)
        # Close the file even if reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Single-character chunks exercise the incremental parsing paths.
    for c in data:
        x.feed(c)
    x.close()
550
551
# Run the demo driver when this file is executed as a script.
if __name__ == '__main__':
    test()
554