# -*- coding: utf-8 -*-
"""A lexical analyzer class for simple shell-like syntaxes."""

# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
#
# NOTE(review): ported from Python 2 syntax (print statements, basestring,
# cStringIO, ``raise E, msg``, Py2 iterator protocol) to Python 3.
# The token-scanning state machine is preserved statement-for-statement.

import os.path
import sys
from collections import deque
from io import StringIO

__all__ = ["shlex", "split"]


class shlex:
    "A lexical analyzer class for simple shell-like syntaxes."

    def __init__(self, instream=None, infile=None, posix=False):
        """Create a lexer.

        instream -- a file-like object or a string to tokenize;
                    defaults to sys.stdin when omitted.
        infile   -- name of the input file, used by error_leader().
        posix    -- when true, apply POSIX shell parsing rules
                    (quote stripping, backslash escapes, None EOF).
        """
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # In POSIX mode EOF is None so that an empty quoted token ('')
        # remains distinguishable from end-of-input.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        # Fixed "abcdfe" typo from the original; membership-only string,
        # so ordering never affected behavior.
        self.wordchars = ('abcdefghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            # Latin-1 accented letters are word characters in POSIX mode.
            # (Reconstructed from the mis-encoded ISO-8859-1 literals in
            # the original source.)
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        self.state = ' '
        self.pushback = deque()      # tokens pushed back by push_token()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        self.filestack = deque()     # saved (infile, instream, lineno) frames
        self.source = None           # keyword that triggers file inclusion
        if self.debug:
            print('shlex: reading from %s, line %d'
                  % (self.instream, self.lineno))

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        "Push an input source onto the lexer's input source stack."
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        "Pop the input source stack."
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        "Get a token from the input stream (or from stack if it's nonempty)"
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions: a token equal to self.source ("source"-style
        # keyword) makes the next token name a file to be lexed in-line.
        if self.source is not None:
            while raw == self.source:
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?  Pop back to the including stream.
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def read_token(self):
        """Scan the input stream for the next raw token.

        Small state machine; self.state is one of:
          None   -- end of file seen
          ' '    -- between tokens (in whitespace)
          'a'    -- accumulating a word
          a quote character   -- inside a quoted string
          the escape character -- just after a backslash (POSIX mode)
        """
        quoted = False
        escapedstate = ' '
        while True:
            nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno = self.lineno + 1
            if self.debug >= 3:
                print("shlex: in state", repr(self.state),
                      "I see character:", repr(nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.quotes:
                    # Non-POSIX mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Punctuation is emitted as a one-character token.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:       # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token = self.token + nextchar
                        self.state = ' '
                        break
                    else:
                        self.state = 'a'
                elif self.posix and nextchar in self.escape and \
                        self.state in self.escapedquotes:
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token = self.token + nextchar
            elif self.state in self.escape:
                if not nextchar:       # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if escapedstate in self.quotes and \
                        nextchar != self.state and nextchar != escapedstate:
                    self.token = self.token + self.state
                self.token = self.token + nextchar
                self.state = escapedstate
            elif self.state == 'a':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self.instream.readline()
                    self.lineno = self.lineno + 1
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars or nextchar in self.quotes \
                        or self.whitespace_split:
                    self.token = self.token + nextchar
                else:
                    # Punctuation ends the word; push the character back so
                    # it becomes its own token on the next call.
                    self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        # In POSIX mode an unquoted empty result means EOF (None); a quoted
        # empty string ('') is a legitimate token.
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        "Hook called on a filename to be sourced."
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        "Emit a C-compiler-like, Emacs-friendly error-message leader."
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        return self

    def __next__(self):
        "Iterator protocol: return the next token or raise StopIteration."
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token

    # Backward-compatible alias for callers using the Py2 iterator protocol.
    next = __next__


def split(s, comments=False, posix=True):
    """Split the string *s* using shell-like syntax.

    comments -- when false (the default), '#' has no special meaning.
    posix    -- use POSIX parsing rules (quote stripping, escapes).
    """
    lex = shlex(s, posix=posix)
    lex.whitespace_split = True
    if not comments:
        lex.commenters = ''
    return list(lex)


if __name__ == '__main__':
    # Simple smoke test: tokenize stdin or the file named on the command
    # line and print one token per line until EOF.
    if len(sys.argv) == 1:
        lexer = shlex()
    else:
        file = sys.argv[1]
        lexer = shlex(open(file), file)
    while 1:
        tt = lexer.get_token()
        if tt:
            print("Token: " + repr(tt))
        else:
            break