#-----------------------------------------------------------------------------
# ply: lex.py
#
# Author: David M. Beazley (dave@dabeaz.com)
#
# Copyright (C) 2001-2006, David M. Beazley
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See the file COPYING for a complete copy of the LGPL.
#-----------------------------------------------------------------------------

__version__ = "2.2"

import re, sys, types

# Regular expression used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Available instance types.  This is used when lexers are defined by a class.
# It's a little funky because I want to preserve backwards compatibility
# with Python 2.0 where types.ObjectType is undefined.

try:
   _INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
   _INSTANCETYPE = types.InstanceType
   class object: pass       # Note: needed if no new-style classes present

# Exception thrown when invalid token encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self,message,s):
         self.args = (message,)
         self.text = s

# Token class
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)
    def skip(self,n):
        self.lexer.skip(n)

# -----------------------------------------------------------------------------
# Lexer class
#
# This class encapsulates all of the methods and data associated with a lexer.
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
# -----------------------------------------------------------------------------
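#
# A minimal usage sketch (illustrative only, not part of the library).  It
# assumes a lexer object has already been built with lex() as defined later
# in this file; token() returns None when the input is exhausted:
#
#     lexer.input("x = 3 + 4")
#     while 1:
#         tok = lexer.token()
#         if not tok: break
#         print tok
# -----------------------------------------------------------------------------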

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexs
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexdebug = 0             # Debugging mode
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = Lexer()
        c.lexstatere = self.lexstatere
        c.lexstateinfo = self.lexstateinfo
        c.lexstateretext = self.lexstateretext
        c.lexstate = self.lexstate
        c.lexstatestack = self.lexstatestack
        c.lexstateignore = self.lexstateignore
        c.lexstateerrorf = self.lexstateerrorf
        c.lexreflags = self.lexreflags
        c.lexdata = self.lexdata
        c.lexpos = self.lexpos
        c.lexlen = self.lexlen
        c.lextokens = self.lextokens
        c.lexdebug = self.lexdebug
        c.lineno = self.lineno
        c.lexoptimize = self.lexoptimize
        c.lexliterals = self.lexliterals
        c.lexmodule   = self.lexmodule

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                     newfindex = []
                     for f in findex:
                         if not f or not f[0]:
                             newfindex.append(f)
                             continue
                         newfindex.append((getattr(object,f[0].__name__),f[1]))
                     newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object

        # Set up other attributes
        c.begin(c.lexstate)
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile):
        tf = open(tabfile+".py","w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_lextokens    = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags   = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals  = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        for key, lre in self.lexstatere.items():
             titem = []
             for i in range(len(lre)):
                  titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1])))
             tabre[key] = titem

        tf.write("_lexstatere   = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
             if ef:
                  taberr[key] = ef.__name__
             else:
                  taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        exec "import %s as lextab" % tabfile
        self.lextokens      = lextab._lextokens
        self.lexreflags     = lextab._lexreflags
        self.lexliterals    = lextab._lexliterals
        self.lexstateinfo   = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere     = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
             titem = []
             txtitem = []
             for i in range(len(lre)):
                  titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
                  txtitem.append(lre[i][0])
             self.lexstatere[key] = titem
             self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
             self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        if not (isinstance(s,types.StringType) or isinstance(s,types.UnicodeType)):
            raise ValueError, "Expected a string"
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not self.lexstatere.has_key(state):
            raise ValueError, "Undefined state"
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
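    #
    # For reference, a rule function driven by token() typically looks like
    # the sketch below (illustrative only; the rule name and regex are made
    # up).  token() passes the rule a LexToken and expects either a token
    # back or None to discard the match:
    #
    #     def t_NUMBER(t):
    #         r'\d+'
    #         t.value = int(t.value)
    #         return t
    # ------------------------------------------------------------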
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Set last match in lexer so that rules can access it if they want
                self.lexmatch = m

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos
                tok.lexer = self

                lexpos = m.end()
                i = m.lastindex
                func,tok.type = lexindexfunc[i]
                self.lexpos = lexpos

                if not func:
                   # If no token type was set, it's an ignored token
                   if tok.type: return tok
                   break

                # if func not callable, it means it's an ignored token
                if not callable(func):
                   break

                # If token is processed by a function, call it
                newtok = func(tok)

                # Every function must return a token, if nothing, we just move to next token
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not self.lextokens.has_key(newtok.type):
                        raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.func_code.co_filename, func.func_code.co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.lexer = self
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError, ("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
             raise RuntimeError, "No input string given with input()"
        return None

# -----------------------------------------------------------------------------
# _validate_file()
#
# This checks to see if there are duplicated t_rulename() functions or strings
# in the parser input file.  This is done using a simple regular expression
# match on each line in the filename.
# -----------------------------------------------------------------------------

def _validate_file(filename):
    import os.path
    base,ext = os.path.splitext(filename)
    if ext != '.py': return 1        # No idea what the file is. Return OK

    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except IOError:
        return 1                       # Oh well

    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
    counthash = { }
    linen = 1
    noerror = 1
    for l in lines:
        m = fre.match(l)
        if not m:
            m = sre.match(l)
        if m:
            name = m.group(1)
            prev = counthash.get(name)
            if not prev:
                counthash[name] = linen
            else:
                print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)
                noerror = 0
        linen += 1
    return noerror

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist):
    result = []
    for f in funclist:
         if f and f[0]:
             result.append((f[0].__name__,f[1]))
         else:
             result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
     result = []
     for n in namelist:
          if n and n[0]:
              result.append((fdict[n[0]],n[1]))
          else:
              result.append(n)
     return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,handle.__name__[2:])
            elif handle is not None:
                # If rule was specified as a string, we build an anonymous
                # callback function to carry out the action
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                    print "IGNORE", f
                else:
                    lexindexfunc[i] = (None, f[2:])

        return [(lexre,lexindexfunc)],[regex]
    except Exception,e:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre = _form_master_re(relist[:m],reflags,ldict)
        rlist, rre = _form_master_re(relist[m:],reflags,ldict)
        return llist+rlist, lre+rre

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token.  For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
         if not names.has_key(parts[i]) and parts[i] != 'ANY': break
    if i > 1:
       states = tuple(parts[1:i])
    else:
       states = ('INITIAL',)

    if 'ANY' in states:
       states = tuple(names.keys())

    tokenname = "_".join(parts[i:])
    return (states,tokenname)

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
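#
# For orientation, a minimal module consumed by lex() might look like the
# sketch below (illustrative only; the token names and regexes are made up).
# lex() picks up 'tokens', optional 'literals'/'states', the t_ rules, and an
# optional t_error function from the module's namespace:
#
#     import lex
#
#     tokens = ('NAME','NUMBER')
#     t_NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'
#     def t_NUMBER(t):
#         r'\d+'
#         t.value = int(t.value)
#         return t
#     t_ignore = ' \t'
#     def t_error(t):
#         print "Illegal character '%s'" % t.value[0]
#         t.lexer.skip(1)
#
#     lexer = lex.lex()
# -----------------------------------------------------------------------------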
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0):
    global lexer
    ldict = None
    stateinfo  = { 'INITIAL' : 'inclusive'}
    error = 0
    files = { }
    lexobj = Lexer()
    lexobj.lexdebug = debug
    lexobj.lexoptimize = optimize
    global token,input

    if nowarn: warn = 0
    else: warn = 1

    if object: module = object

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, _INSTANCETYPE):
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for (i,v) in _items:
                ldict[i] = v
        else:
            raise ValueError,"Expected a module or instance"
        lexobj.lexmodule = module

    else:
        # No module given.  We might be able to get information from the caller.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Get the tokens, states, and literals variables (if any)
    if (module and isinstance(module,_INSTANCETYPE)):
        tokens   = getattr(module,"tokens",None)
        states   = getattr(module,"states",None)
        literals = getattr(module,"literals","")
    else:
        tokens   = ldict.get("tokens",None)
        states   = ldict.get("states",None)
        literals = ldict.get("literals","")

    if not tokens:
        raise SyntaxError,"lex: module does not define 'tokens'"
    if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)):
        raise SyntaxError,"lex: tokens must be a list or tuple."

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                print "lex: Bad token name '%s'" % n
                error = 1
            if warn and lexobj.lextokens.has_key(n):
                print "lex: Warning. Token '%s' multiply defined." % n
            lexobj.lextokens[n] = None
    else:
        for n in tokens: lexobj.lextokens[n] = None

    if debug:
        print "lex: tokens = '%s'" % lexobj.lextokens.keys()

    try:
         for c in literals:
               if not (isinstance(c,types.StringType) or isinstance(c,types.UnicodeType)) or len(c) > 1:
                    print "lex: Invalid literal %s. Must be a single character" % repr(c)
                    error = 1
                    continue

    except TypeError:
         print "lex: Invalid literals specification. literals must be a sequence of characters."
         error = 1

    lexobj.lexliterals = literals

    # Build statemap
    if states:
         if not (isinstance(states,types.TupleType) or isinstance(states,types.ListType)):
              print "lex: states must be defined as a tuple or list."
              error = 1
         else:
              for s in states:
                    if not isinstance(s,types.TupleType) or len(s) != 2:
                           print "lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s)
                           error = 1
                           continue
                    name, statetype = s
                    if not isinstance(name,types.StringType):
                           print "lex: state name %s must be a string" % repr(name)
                           error = 1
                           continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                           print "lex: state type for state %s must be 'inclusive' or 'exclusive'" % name
                           error = 1
                           continue
                    if stateinfo.has_key(name):
                           print "lex: state '%s' already defined." % name
                           error = 1
                           continue
                    stateinfo[name] = statetype

    # Get a list of symbols with the t_ or s_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ]

    # Now build up a list of functions and a list of strings

    funcsym =  { }        # Symbols defined as functions
    strsym =   { }        # Symbols defined as strings
    toknames = { }        # Mapping of symbols to token names

    for s in stateinfo.keys():
         funcsym[s] = []
         strsym[s] = []

    ignore   = { }        # Ignore strings by state
    errorf   = { }        # Error functions by state

    if len(tsymbols) == 0:
        raise SyntaxError,"lex: no rules of the form t_rulename are defined."

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f,stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f,t))
        elif (isinstance(t, types.StringType) or isinstance(t,types.UnicodeType)):
            for s in states: strsym[s].append((f,t))
        else:
            print "lex: %s not defined as a function or string" % f
            error = 1

    # Sort the functions by line number
    for f in funcsym.values():
        f.sort(lambda x,y: cmp(x[1].func_code.co_firstlineno,y[1].func_code.co_firstlineno))

    # Sort the strings by regular expression length
    for s in strsym.values():
        s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))

    regexs = { }

    # Build the master regular expressions
    for state in stateinfo.keys():
        regex_list = []

        # Add rules defined by functions first
        for fname, f in funcsym[state]:
            line = f.func_code.co_firstlineno
            file = f.func_code.co_filename
            files[file] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.func_code.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)
                    error = 1
                    continue

                if nargs < reqargs:
                    print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)
                    error = 1
                    continue

                if tokname == 'ignore':
                    print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    try:
                        c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                             print "%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__)
                             error = 1
                             continue
                    except re.error,e:
                        print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)
                        if '#' in f.__doc__:
                             print "%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__)
                        error = 1
                        continue

                    if debug:
                        print "lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state)

                # Okay. The regular expression seemed okay.  Let's append it to the master regular
                # expression we're building

                regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
            else:
                print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)

        # Now add all of the simple rules
        for name,r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                 ignore[state] = r
                 continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError,"lex: Rule '%s' must be defined as a function" % name
                    error = 1
                    continue

                if not lexobj.lextokens.has_key(tokname) and tokname.find("ignore_") < 0:
                    print "lex: Rule '%s' defined for an unspecified token %s." % (name,tokname)
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
                    if (c.match("")):
                         print "lex: Regular expression for rule '%s' matches empty string." % name
                         error = 1
                         continue
                except re.error,e:
                    print "lex: Invalid regular expression for rule '%s'. %s" % (name,e)
                    if '#' in r:
                         print "lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name

                    error = 1
                    continue
                if debug:
                    print "lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state)

            regex_list.append("(?P<%s>%s)" % (name,r))

        if not regex_list:
             print "lex: No rules defined for state '%s'" % state
             error = 1

        regexs[state] = regex_list


    if not optimize:
        for f in files.keys():
           if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError,"lex: Unable to build lexer."

    # From this point forward, we're reasonably confident that we can build the lexer.
    # No more errors will be generated, but there might be some warning messages.

    # Build the master regular expressions

    for state in regexs.keys():
        lexre, re_text = _form_master_re(regexs[state],reflags,ldict)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        if debug:
            for i in range(len(re_text)):
                 print "lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i])

    # For inclusive states, we need to add the INITIAL state
    for state,type in stateinfo.items():
        if state != "INITIAL" and type == 'inclusive':
             lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
             lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL",None)
    if warn and not lexobj.lexerrorf:
        print "lex: Warning. no t_error rule is defined."

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
              if warn and not errorf.has_key(s):
                   print "lex: Warning. no error rule is defined for exclusive state '%s'" % s
              if warn and not ignore.has_key(s) and lexobj.lexignore:
                   print "lex: Warning. no ignore rule is defined for exclusive state '%s'" % s
        elif stype == 'inclusive':
              if not errorf.has_key(s):
                   errorf[s] = errorf.get("INITIAL",None)
              if not ignore.has_key(s):
                   ignore[s] = ignore.get("INITIAL","")


    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab)

    return lexobj

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print "Reading from standard input (type EOF to end):"
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print "(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos)


# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator function can be used to set the regex expression on a function
# when its docstring might need to be set in an alternative way
# -----------------------------------------------------------------------------
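#
# A typical use (illustrative only; the 'identifier' pattern and rule name are
# made up) is to attach a regex that is computed at run time rather than
# written as a literal docstring:
#
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t
# -----------------------------------------------------------------------------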

def TOKEN(r):
    def set_doc(f):
        f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
