#-----------------------------------------------------------------------------
# ply: lex.py
#
# Author: David M. Beazley (dave@dabeaz.com)
#
# Copyright (C) 2001-2006, David M. Beazley
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# See the file COPYING for a complete copy of the LGPL.
#-----------------------------------------------------------------------------

__version__ = "2.2"

import re, sys, types

from . import util
import collections


# Regular expression used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Available instance types.  This is used when lexers are defined by a class.
# In Python 3 the InstanceType and ObjectType are no more; they've passed on,
# ceased to be, they are ex-classes, along with old-style classes.

try:
   _INSTANCETYPE = (types.InstanceType, types.ObjectType)
except AttributeError:
   _INSTANCETYPE = object

# Exception thrown when an invalid token is encountered and no default error
# handler is defined.
class LexError(Exception):
    def __init__(self,message,s):
         self.args = (message,)
         self.text = s

# Token class
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)
    def skip(self,n):
        self.lexer.skip(n)

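# Illustrative note (not part of the original source): every object returned
# by the lexer is a LexToken carrying four user-visible attributes (tok.type,
# tok.value, tok.lineno, tok.lexpos) plus tok.lexer, a back-reference to the
# Lexer that produced it.  For a hypothetical NUMBER rule matching "42" at the
# start of line 1, str(tok) would read:
#
#     LexToken(NUMBER,'42',1,0)
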
# -----------------------------------------------------------------------------
# Lexer class
#
# This class encapsulates all of the methods and data associated with a lexer.
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
# -----------------------------------------------------------------------------

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexes
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexdebug = 0             # Debugging mode
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = Lexer()
        c.lexstatere = self.lexstatere
        c.lexstateinfo = self.lexstateinfo
        c.lexstateretext = self.lexstateretext
        c.lexstate = self.lexstate
        c.lexstatestack = self.lexstatestack
        c.lexstateignore = self.lexstateignore
        c.lexstateerrorf = self.lexstateerrorf
        c.lexreflags = self.lexreflags
        c.lexdata = self.lexdata
        c.lexpos = self.lexpos
        c.lexlen = self.lexlen
        c.lextokens = self.lextokens
        c.lexdebug = self.lexdebug
        c.lineno = self.lineno
        c.lexoptimize = self.lexoptimize
        c.lexliterals = self.lexliterals
        c.lexmodule   = self.lexmodule

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                     newfindex = []
                     for f in findex:
                         if not f or not f[0]:
                             newfindex.append(f)
                             continue
                         newfindex.append((getattr(object,f[0].__name__),f[1]))
                     newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object

        # Set up other attributes
        c.begin(c.lexstate)
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile):
        tf = open(tabfile+".py","w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_lextokens    = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags   = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals  = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        for key, lre in self.lexstatere.items():
             titem = []
             for i in range(len(lre)):
                  titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1])))
             tabre[key] = titem

        tf.write("_lexstatere   = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
             if ef:
                  taberr[key] = ef.__name__
             else:
                  taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        # Import the generated table module.  importlib is used here so the
        # module object is bound reliably (exec'ing an import inside a
        # function does not create a usable local name on Python 3).
        import importlib
        lextab = importlib.import_module(tabfile)
        self.lextokens      = lextab._lextokens
        self.lexreflags     = lextab._lexreflags
        self.lexliterals    = lextab._lexliterals
        self.lexstateinfo   = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere     = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
             titem = []
             txtitem = []
             for i in range(len(lre)):
                  titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict)))
                  txtitem.append(lre[i][0])
             self.lexstatere[key] = titem
             self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
             self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        if not (isinstance(s,util.bytes_type) or isinstance(s, util.string_type)):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if state not in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

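    # Illustrative sketch (not part of the original source): how the state
    # methods above are typically combined inside token rules.  The state
    # name 'comment' is hypothetical:
    #
    #     lexer.push_state('comment')   # enter 'comment', remembering the old state
    #     ...                           # scan using the 'comment' rules
    #     lexer.pop_state()             # return to the saved state
    #
    # begin() switches the state unconditionally, while push_state() and
    # pop_state() keep a stack so nested scanning modes can be restored.
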
    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # Short-circuit for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Set last match in lexer so that rules can access it if they want
                self.lexmatch = m

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos
                tok.lexer = self

                lexpos = m.end()
                i = m.lastindex
                func,tok.type = lexindexfunc[i]
                self.lexpos = lexpos

                if not func:
                   # If no token type was set, it's an ignored token
                   if tok.type: return tok
                   break

                # If func is not callable, it's an ignored token
                if not callable(func):
                   break

                # If the token is processed by a function, call it
                newtok = func(tok)

                # Every function must return a token.  If it returns nothing,
                # we just move on to the next token.
                if not newtok:
                    lexpos = self.lexpos        # This is here in case user has updated lexpos.
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if newtok.type not in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func.__code__.co_filename, func.__code__.co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.lexer = self
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
             raise RuntimeError("No input string given with input()")
        return None

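# Illustrative sketch (not part of the original source): the usual driver loop
# around the Lexer class above.  The instance is normally obtained from lex()
# further below, and the input string here is arbitrary:
#
#     lexer = lex()                  # build a lexer from the caller's t_* rules
#     lexer.input("x = 3 + 4")       # feed it a string
#     while True:
#         tok = lexer.token()        # LexToken objects, None at end of input
#         if tok is None:
#             break
#         print(tok)
#
# token() tries each master regular expression at the current position, skips
# characters listed in t_ignore, falls back to single-character literals, and
# finally calls t_error() (or raises LexError) on unmatched input.
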
# -----------------------------------------------------------------------------
# _validate_file()
#
# This checks to see if there are duplicated t_rulename() functions or strings
# in the lexer input file.  This is done using a simple regular expression
# match on each line in the filename.
# -----------------------------------------------------------------------------

def _validate_file(filename):
    import os.path
    base,ext = os.path.splitext(filename)
    if ext != '.py': return 1        # No idea what the file is. Return OK

    try:
        f = open(filename)
        lines = f.readlines()
        f.close()
    except IOError:
        return 1                       # Oh well

    fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
    sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')
    counthash = { }
    linen = 1
    noerror = 1
    for l in lines:
        m = fre.match(l)
        if not m:
            m = sre.match(l)
        if m:
            name = m.group(1)
            prev = counthash.get(name)
            if not prev:
                counthash[name] = linen
            else:
                print("%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev))
                noerror = 0
        linen += 1
    return noerror

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist):
    result = []
    for f in funclist:
         if f and f[0]:
             result.append((f[0].__name__,f[1]))
         else:
             result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
     result = []
     for n in namelist:
          if n and n[0]:
              result.append((fdict[n[0]],n[1]))
          else:
              result.append(n)
     return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,handle.__name__[2:])
            elif handle is not None:
                # If rule was specified as a string, we build an anonymous
                # callback function to carry out the action
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                    print("IGNORE", f)
                else:
                    lexindexfunc[i] = (None, f[2:])

        return [(lexre,lexindexfunc)],[regex]
    except Exception as e:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre = _form_master_re(relist[:m],reflags,ldict)
        rlist, rre = _form_master_re(relist[m:],reflags,ldict)
        return llist+rlist, lre+rre

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_" and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token.  For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
         if parts[i] not in names and parts[i] != 'ANY': break
    if i > 1:
       states = tuple(parts[1:i])
    else:
       states = ('INITIAL',)

    if 'ANY' in states:
       states = tuple(names.keys())

    tokenname = "_".join(parts[i:])
    return (states,tokenname)

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0):
    global lexer
    ldict = None
    stateinfo  = { 'INITIAL' : 'inclusive'}
    error = 0
    files = { }
    lexobj = Lexer()
    lexobj.lexdebug = debug
    lexobj.lexoptimize = optimize
    global token,input

    if nowarn: warn = 0
    else: warn = 1

    if object: module = object

    if module:
        # User supplied a module object.
        if isinstance(module, types.ModuleType):
            ldict = module.__dict__
        elif isinstance(module, _INSTANCETYPE):
            _items = [(k,getattr(module,k)) for k in dir(module)]
            ldict = { }
            for (i,v) in _items:
                ldict[i] = v
        else:
            raise ValueError("Expected a module or instance")
        lexobj.lexmodule = module

    else:
        # No module given.  We might be able to get information from the caller.
        try:
            raise RuntimeError
        except RuntimeError:
            e,b,t = sys.exc_info()
            f = t.tb_frame
            f = f.f_back           # Walk out to our calling function
            ldict = f.f_globals    # Grab its globals dictionary

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Get the tokens, states, and literals variables (if any)
    if (module and isinstance(module,_INSTANCETYPE)):
        tokens   = getattr(module,"tokens",None)
        states   = getattr(module,"states",None)
        literals = getattr(module,"literals","")
    else:
        tokens   = ldict.get("tokens",None)
        states   = ldict.get("states",None)
        literals = ldict.get("literals","")

    if not tokens:
        raise SyntaxError("lex: module does not define 'tokens'")
    if not (isinstance(tokens,list) or isinstance(tokens,tuple)):
        raise SyntaxError("lex: tokens must be a list or tuple.")

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    if not optimize:
        for n in tokens:
            if not _is_identifier.match(n):
                print("lex: Bad token name '%s'" % n)
                error = 1
            if warn and n in lexobj.lextokens:
                print("lex: Warning. Token '%s' multiply defined." % n)
            lexobj.lextokens[n] = None
    else:
        for n in tokens: lexobj.lextokens[n] = None

    if debug:
        print("lex: tokens = '%s'" % list(lexobj.lextokens.keys()))

    try:
         for c in literals:
               if not (isinstance(c,util.bytes_type) or isinstance(c, util.string_type)) or len(c) > 1:
                    print("lex: Invalid literal %s. Must be a single character" % repr(c))
                    error = 1
                    continue

    except TypeError:
         print("lex: Invalid literals specification. literals must be a sequence of characters.")
         error = 1

    lexobj.lexliterals = literals

    # Build statemap
    if states:
         if not (isinstance(states,tuple) or isinstance(states,list)):
              print("lex: states must be defined as a tuple or list.")
              error = 1
         else:
              for s in states:
                    if not isinstance(s,tuple) or len(s) != 2:
                           print("lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s))
                           error = 1
                           continue
                    name, statetype = s
                    original_name = name
                    if isinstance(name, util.string_type):
                           name = util.encode_input(name)
                    if not isinstance(name,util.bytes_type) or len(original_name) != len(name):
                           print("lex: state name %s must be a byte string" % repr(original_name))
                           error = 1
                           continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                           print("lex: state type for state %s must be 'inclusive' or 'exclusive'" % name)
                           error = 1
                           continue
                    if name in stateinfo:
                           print("lex: state '%s' already defined." % name)
                           error = 1
                           continue
                    stateinfo[name] = statetype

    # Get a list of symbols with the t_ prefix
    tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ]

    # Now build up a list of functions and a list of strings

    funcsym =  { }        # Symbols defined as functions
    strsym =   { }        # Symbols defined as strings
    toknames = { }        # Mapping of symbols to token names

    for s in stateinfo.keys():
         funcsym[s] = []
         strsym[s] = []

    ignore   = { }        # Ignore strings by state
    errorf   = { }        # Error functions by state

    if len(tsymbols) == 0:
        raise SyntaxError("lex: no rules of the form t_rulename are defined.")

    for f in tsymbols:
        t = ldict[f]
        states, tokname = _statetoken(f,stateinfo)
        toknames[f] = tokname

        if callable(t):
            for s in states: funcsym[s].append((f,t))
        elif (isinstance(t, util.bytes_type) or isinstance(t,util.string_type)):
            for s in states: strsym[s].append((f,t))
        else:
            print("lex: %s not defined as a function or string" % f)
            error = 1

    # Sort the functions by line number
    for f in funcsym.values():
        f.sort(key=lambda x: x[1].__code__.co_firstlineno)

    # Sort the strings by regular expression length
    for s in strsym.values():
        s.sort(key=lambda x: len(x[1]))

    regexs = { }

    # Build the master regular expressions
    for state in stateinfo.keys():
        regex_list = []

        # Add rules defined by functions first
        for fname, f in funcsym[state]:
            line = f.__code__.co_firstlineno
            file = f.__code__.co_filename
            files[file] = None
            tokname = toknames[fname]

            ismethod = isinstance(f, types.MethodType)

            if not optimize:
                nargs = f.__code__.co_argcount
                if ismethod:
                    reqargs = 2
                else:
                    reqargs = 1
                if nargs > reqargs:
                    print("%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__))
                    error = 1
                    continue

                if nargs < reqargs:
                    print("%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__))
                    error = 1
                    continue

                if tokname == 'ignore':
                    print("%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__))
                    error = 1
                    continue

            if tokname == 'error':
                errorf[state] = f
                continue

            if f.__doc__:
                if not optimize:
                    try:
                        c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags)
                        if c.match(""):
                             print("%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__))
                             error = 1
                             continue
                    except re.error as e:
                        print("%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e))
                        if '#' in f.__doc__:
                             print("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__))
                        error = 1
                        continue

                    if debug:
                        print("lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state))

                # Okay. The regular expression seemed okay.  Let's append it to the master regular
                # expression we're building

                regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__))
            else:
                print("%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__))

        # Now add all of the simple rules
        for name,r in strsym[state]:
            tokname = toknames[name]

            if tokname == 'ignore':
                 ignore[state] = r
                 continue

            if not optimize:
                if tokname == 'error':
                    raise SyntaxError("lex: Rule '%s' must be defined as a function" % name)
                    error = 1
                    continue

                if tokname not in lexobj.lextokens and tokname.find("ignore_") < 0:
                    print("lex: Rule '%s' defined for an unspecified token %s." % (name,tokname))
                    error = 1
                    continue
                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags)
                    if (c.match("")):
                         print("lex: Regular expression for rule '%s' matches empty string." % name)
                         error = 1
                         continue
                except re.error as e:
                    print("lex: Invalid regular expression for rule '%s'. %s" % (name,e))
                    if '#' in r:
                         print("lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name)

                    error = 1
                    continue
                if debug:
                    print("lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state))

            regex_list.append("(?P<%s>%s)" % (name,r))

        if not regex_list:
             print("lex: No rules defined for state '%s'" % state)
             error = 1

        regexs[state] = regex_list


    if not optimize:
        for f in files.keys():
           if not _validate_file(f):
                error = 1

    if error:
        raise SyntaxError("lex: Unable to build lexer.")

    # From this point forward, we're reasonably confident that we can build the lexer.
    # No more errors will be generated, but there might be some warning messages.

    # Build the master regular expressions

    for state in regexs.keys():
        lexre, re_text = _form_master_re(regexs[state],reflags,ldict)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        if debug:
            for i in range(len(re_text)):
                 print("lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i]))

    # For inclusive states, we need to add the INITIAL state
    for state,type in stateinfo.items():
        if state != "INITIAL" and type == 'inclusive':
             lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
             lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]

    # Set up ignore variables
    lexobj.lexstateignore = ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = errorf
    lexobj.lexerrorf = errorf.get("INITIAL",None)
    if warn and not lexobj.lexerrorf:
        print("lex: Warning. no t_error rule is defined.")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
              if warn and s not in errorf:
                   print("lex: Warning. no error rule is defined for exclusive state '%s'" % s)
              if warn and s not in ignore and lexobj.lexignore:
                   print("lex: Warning. no ignore rule is defined for exclusive state '%s'" % s)
        elif stype == 'inclusive':
              if s not in errorf:
                   errorf[s] = errorf.get("INITIAL",None)
              if s not in ignore:
                   ignore[s] = ignore.get("INITIAL","")


    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab)

    return lexobj

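# Illustrative sketch (not part of the original source): the kind of calling
# module lex() expects.  Every name below (the token list, t_PLUS, t_NUMBER,
# t_ignore, t_error) is defined by the caller, not by this file:
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS = r'\+'                 # simple rules are plain regex strings
#
#     def t_NUMBER(t):               # function rules carry the regex in __doc__
#         r'\d+'
#         t.value = int(t.value)
#         return t
#
#     t_ignore = ' \t'               # characters skipped between tokens
#
#     def t_error(t):
#         print("Illegal character %r" % t.value[0])
#         t.lexer.skip(1)
#
#     lexer = lex()                  # collects the t_* rules from this module
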
# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            print("Reading from standard input (type EOF to end):")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        print("(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos))


# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator can be used to attach a regular expression to a token rule
# function by setting its docstring, which is handy when the docstring cannot
# be written directly (for example, when the regex is built at run time).
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
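
# Illustrative sketch (not part of the original source): using @TOKEN when the
# regular expression is assembled from other strings (the names 'digit' and
# 'number' are hypothetical):
#
#     digit  = r'[0-9]'
#     number = digit + r'+'
#
#     @TOKEN(number)
#     def t_NUMBER(t):
#         t.value = int(t.value)
#         return t
#
# The decorator simply stores the pattern in t_NUMBER.__doc__, which is where
# lex() looks for the regular expression of a function rule.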