"""A lexical analyzer class for simple shell-like syntaxes."""

# Module and documentation by Eric S. Raymond, 21 Dec 1998
# Input stacking and error message cleanup added by ESR, March 2000
# push_source() and pop_source() made explicit by ESR, January 2001.
# Posix compliance, split(), string arguments, and
# iterator interface by Gustavo Niemeyer, April 2003.
# changes to tokenize more like Posix shells by Vinay Sajip, July 2016.

import os
import re
import sys
from collections import deque

from io import StringIO

__all__ = ["shlex", "split", "quote", "join"]


class shlex:
    """A lexical analyzer class for simple shell-like syntaxes.

    Constructor arguments:

    instream -- the input: a string (wrapped in a StringIO), an object
        providing read(1) and readline(), or None to read from sys.stdin
        (in which case *infile* is ignored and self.infile is None).
    infile -- name associated with the stream; used by error_leader() and,
        when it is a string, by sourcehook() to resolve relative paths.
    posix -- when true, end of input is reported as None instead of '',
        extra accented letters count as word characters, and read_token()
        processes escapes and strips quotes.
    punctuation_chars -- False/empty for none, True for '();<>|&', or a
        string of characters that are returned as runs of punctuation.

    Instances are iterators over the tokens of the input.
    """

    def __init__(self, instream=None, infile=None, posix=False,
                 punctuation_chars=False):
        if isinstance(instream, str):
            instream = StringIO(instream)
        if instream is not None:
            self.instream = instream
            self.infile = infile
        else:
            self.instream = sys.stdin
            self.infile = None
        self.posix = posix
        # Marker returned by get_token()/read_token() at end of input.
        if posix:
            self.eof = None
        else:
            self.eof = ''
        self.commenters = '#'
        # Character order is irrelevant here: the string is only used for
        # membership tests (and translate() below).
        self.wordchars = ('abcdfeghijklmnopqrstuvwxyz'
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_')
        if self.posix:
            self.wordchars += ('ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ'
                               'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ')
        self.whitespace = ' \t\r\n'
        self.whitespace_split = False
        self.quotes = '\'"'
        self.escape = '\\'
        self.escapedquotes = '"'
        # Lexer state; see read_token() for the meaning of each value.
        self.state = ' '
        # Tokens pushed back by push_token(); consumed first by get_token().
        self.pushback = deque()
        self.lineno = 1
        self.debug = 0
        self.token = ''
        # Saved (infile, instream, lineno) triples of suspended sources.
        self.filestack = deque()
        # Token that triggers source inclusion in get_token(); None disables.
        self.source = None
        if not punctuation_chars:
            punctuation_chars = ''
        elif punctuation_chars is True:
            punctuation_chars = '();<>|&'
        self._punctuation_chars = punctuation_chars
        if punctuation_chars:
            # _pushback_chars is a push back queue used by lookahead logic
            self._pushback_chars = deque()
            # these chars added because allowed in file names, args, wildcards
            self.wordchars += '~-./*?='
            # remove any punctuation chars from wordchars
            t = self.wordchars.maketrans(dict.fromkeys(punctuation_chars))
            self.wordchars = self.wordchars.translate(t)

    @property
    def punctuation_chars(self):
        "Read-only string of punctuation characters fixed at construction."
        return self._punctuation_chars

    def push_token(self, tok):
        "Push a token onto the stack popped by the get_token method"
        if self.debug >= 1:
            print("shlex: pushing token " + repr(tok))
        self.pushback.appendleft(tok)

    def push_source(self, newstream, newfile=None):
        "Push an input source onto the lexer's input source stack."
        if isinstance(newstream, str):
            newstream = StringIO(newstream)
        # Remember where we were so pop_source() can resume the old source.
        self.filestack.appendleft((self.infile, self.instream, self.lineno))
        self.infile = newfile
        self.instream = newstream
        self.lineno = 1
        if self.debug:
            if newfile is not None:
                print('shlex: pushing to file %s' % (self.infile,))
            else:
                print('shlex: pushing to stream %s' % (self.instream,))

    def pop_source(self):
        "Pop the input source stack."
        self.instream.close()
        (self.infile, self.instream, self.lineno) = self.filestack.popleft()
        if self.debug:
            print('shlex: popping to %s, line %d'
                  % (self.instream, self.lineno))
        self.state = ' '

    def get_token(self):
        "Get a token from the input stream (or from stack if it's nonempty)"
        if self.pushback:
            tok = self.pushback.popleft()
            if self.debug >= 1:
                print("shlex: popping token " + repr(tok))
            return tok
        # No pushback.  Get a token.
        raw = self.read_token()
        # Handle inclusions
        if self.source is not None:
            while raw == self.source:
                # NOTE(review): if the source keyword is the last token,
                # read_token() returns the EOF marker here and the default
                # sourcehook() indexes into it.
                spec = self.sourcehook(self.read_token())
                if spec:
                    (newfile, newstream) = spec
                    self.push_source(newstream, newfile)
                raw = self.get_token()
        # Maybe we got EOF instead?
        while raw == self.eof:
            if not self.filestack:
                return self.eof
            else:
                # An included source ran dry: resume the one that included it.
                self.pop_source()
                raw = self.get_token()
        # Neither inclusion nor EOF
        if self.debug >= 1:
            if raw != self.eof:
                print("shlex: token=" + repr(raw))
            else:
                print("shlex: token=EOF")
        return raw

    def _skip_comment_line(self):
        "Discard the rest of the current input line (a comment)."
        # lineno counts newlines actually seen, so a comment that runs into
        # end of input without a line terminator must not advance it.
        # Lines ending in '\r' are still counted, as they were before.
        if self.instream.readline().endswith(('\n', '\r')):
            self.lineno += 1

    def read_token(self):
        """Read one raw token from the current input stream.

        Returns the token string; at end of input returns '' (or None in
        posix mode when nothing quoted was read).  Raises ValueError on an
        unterminated quotation or a dangling escape character.

        self.state drives the scanner and persists between calls:
          ' '           between tokens
          'a'           inside a word
          'c'           inside a run of punctuation characters
          a quote char  inside a quoted string opened by that character
          an escape     just after an escape character; escapedstate holds
                        the state to return to
          None          end of input has been reached
        """
        quoted = False
        escapedstate = ' '
        while True:
            # Characters put back by the lookahead logic take priority.
            if self.punctuation_chars and self._pushback_chars:
                nextchar = self._pushback_chars.pop()
            else:
                nextchar = self.instream.read(1)
            if nextchar == '\n':
                self.lineno += 1
            if self.debug >= 3:
                print("shlex: in state %r I see character: %r"
                      % (self.state, nextchar))
            if self.state is None:
                self.token = ''        # past end of file
                break
            elif self.state == ' ':
                if not nextchar:
                    self.state = None  # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in whitespace state")
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self._skip_comment_line()
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif nextchar in self.wordchars:
                    self.token = nextchar
                    self.state = 'a'
                elif nextchar in self.punctuation_chars:
                    self.token = nextchar
                    self.state = 'c'
                elif nextchar in self.quotes:
                    # Non-posix mode keeps the quote characters in the token.
                    if not self.posix:
                        self.token = nextchar
                    self.state = nextchar
                elif self.whitespace_split:
                    self.token = nextchar
                    self.state = 'a'
                else:
                    # Any other character is a one-character token.
                    self.token = nextchar
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
            elif self.state in self.quotes:
                quoted = True
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in quotes state")
                    # XXX what error should be raised here?
                    raise ValueError("No closing quotation")
                if nextchar == self.state:
                    if not self.posix:
                        self.token += nextchar
                        self.state = ' '
                        break
                    else:
                        # posix: the closing quote is dropped and the word
                        # may continue.
                        self.state = 'a'
                elif (self.posix and nextchar in self.escape and self.state
                      in self.escapedquotes):
                    escapedstate = self.state
                    self.state = nextchar
                else:
                    self.token += nextchar
            elif self.state in self.escape:
                if not nextchar:      # end of file
                    if self.debug >= 2:
                        print("shlex: I see EOF in escape state")
                    # XXX what error should be raised here?
                    raise ValueError("No escaped character")
                # In posix shells, only the quote itself or the escape
                # character may be escaped within quotes.
                if (escapedstate in self.quotes and
                        nextchar != self.state and nextchar != escapedstate):
                    self.token += self.state
                self.token += nextchar
                self.state = escapedstate
            elif self.state in ('a', 'c'):
                if not nextchar:
                    self.state = None   # end of file
                    break
                elif nextchar in self.whitespace:
                    if self.debug >= 2:
                        print("shlex: I see whitespace in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
                elif nextchar in self.commenters:
                    self._skip_comment_line()
                    if self.posix:
                        self.state = ' '
                        if self.token or (self.posix and quoted):
                            break   # emit current token
                        else:
                            continue
                elif self.state == 'c':
                    if nextchar in self.punctuation_chars:
                        self.token += nextchar
                    else:
                        # End of the punctuation run: keep the character for
                        # the next token unless it is mere whitespace.
                        if nextchar not in self.whitespace:
                            self._pushback_chars.append(nextchar)
                        self.state = ' '
                        break
                elif self.posix and nextchar in self.quotes:
                    self.state = nextchar
                elif self.posix and nextchar in self.escape:
                    escapedstate = 'a'
                    self.state = nextchar
                elif (nextchar in self.wordchars or nextchar in self.quotes
                      or (self.whitespace_split and
                          nextchar not in self.punctuation_chars)):
                    self.token += nextchar
                else:
                    # The character ends the word; save it for the next call.
                    if self.punctuation_chars:
                        self._pushback_chars.append(nextchar)
                    else:
                        self.pushback.appendleft(nextchar)
                    if self.debug >= 2:
                        print("shlex: I see punctuation in word state")
                    self.state = ' '
                    if self.token or (self.posix and quoted):
                        break   # emit current token
                    else:
                        continue
        result = self.token
        self.token = ''
        # In posix mode an empty unquoted result means end of input, while an
        # empty quoted string ('' or "") is a real token.
        if self.posix and not quoted and result == '':
            result = None
        if self.debug > 1:
            if result:
                print("shlex: raw token=" + repr(result))
            else:
                print("shlex: raw token=EOF")
        return result

    def sourcehook(self, newfile):
        "Hook called on a filename to be sourced."
        # Strip the first and last character when the name starts with '"'.
        if newfile[0] == '"':
            newfile = newfile[1:-1]
        # This implements cpp-like semantics for relative-path inclusion.
        if isinstance(self.infile, str) and not os.path.isabs(newfile):
            newfile = os.path.join(os.path.dirname(self.infile), newfile)
        # The opened file is closed by pop_source() once it is exhausted.
        return (newfile, open(newfile, "r"))

    def error_leader(self, infile=None, lineno=None):
        "Emit a C-compiler-like, Emacs-friendly error-message leader."
        if infile is None:
            infile = self.infile
        if lineno is None:
            lineno = self.lineno
        return "\"%s\", line %d: " % (infile, lineno)

    def __iter__(self):
        "Return the lexer itself; tokens are produced by __next__()."
        return self

    def __next__(self):
        "Return the next token, raising StopIteration at end of input."
        token = self.get_token()
        if token == self.eof:
            raise StopIteration
        return token


def split(s, comments=False, posix=True):
    """Split the string *s* using shell-like syntax.

    Tokens are separated on whitespace only.  Comments are parsed only when
    *comments* is true; *posix* is passed through to the shlex instance.
    Passing None for *s* is deprecated (the lexer then reads sys.stdin).
    """
    if s is None:
        import warnings
        warnings.warn("Passing None for 's' to shlex.split() is deprecated.",
                      DeprecationWarning, stacklevel=2)
    lex = shlex(s, posix=posix)
    lex.whitespace_split = True
    if not comments:
        lex.commenters = ''
    return list(lex)


def join(split_command):
    """Return a shell-escaped string from *split_command*."""
    return ' '.join(quote(arg) for arg in split_command)


# Matches the first character that is not safe to leave unquoted.
_find_unsafe = re.compile(r'[^\w@%+=:,./-]', re.ASCII).search


def quote(s):
    """Return a shell-escaped version of the string *s*."""
    if not s:
        return "''"
    if _find_unsafe(s) is None:
        return s

    # use single quotes, and put single quotes into double quotes
    # the string $'b is then quoted as '$'"'"'b'
    return "'" + s.replace("'", "'\"'\"'") + "'"


def _print_tokens(lexer):
    "Print every token from *lexer* until an empty/EOF token is returned."
    while True:
        tt = lexer.get_token()
        if not tt:
            break
        print("Token: " + repr(tt))


if __name__ == '__main__':
    if len(sys.argv) == 1:
        _print_tokens(shlex())
    else:
        fn = sys.argv[1]
        with open(fn) as f:
            _print_tokens(shlex(f, fn))