1"""Define partial Python code Parser used by editor and hyperparser. 2 3Instances of ParseMap are used with str.translate. 4 5The following bound search and match functions are defined: 6_synchre - start of popular statement; 7_junkre - whitespace or comment line; 8_match_stringre: string, possibly without closer; 9_itemre - line that may have bracket structure start; 10_closere - line that must be followed by dedent. 11_chew_ordinaryre - non-special characters. 12""" 13import re 14 15# Reason last statement is continued (or C_NONE if it's not). 16(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE, 17 C_STRING_NEXT_LINES, C_BRACKET) = range(5) 18 19# Find what looks like the start of a popular statement. 20 21_synchre = re.compile(r""" 22 ^ 23 [ \t]* 24 (?: while 25 | else 26 | def 27 | return 28 | assert 29 | break 30 | class 31 | continue 32 | elif 33 | try 34 | except 35 | raise 36 | import 37 | yield 38 ) 39 \b 40""", re.VERBOSE | re.MULTILINE).search 41 42# Match blank line or non-indenting comment line. 43 44_junkre = re.compile(r""" 45 [ \t]* 46 (?: \# \S .* )? 47 \n 48""", re.VERBOSE).match 49 50# Match any flavor of string; the terminating quote is optional 51# so that we're robust in the face of incomplete program text. 52 53_match_stringre = re.compile(r""" 54 \""" [^"\\]* (?: 55 (?: \\. | "(?!"") ) 56 [^"\\]* 57 )* 58 (?: \""" )? 59 60| " [^"\\\n]* (?: \\. [^"\\\n]* )* "? 61 62| ''' [^'\\]* (?: 63 (?: \\. | '(?!'') ) 64 [^'\\]* 65 )* 66 (?: ''' )? 67 68| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '? 69""", re.VERBOSE | re.DOTALL).match 70 71# Match a line that starts with something interesting; 72# used to find the first item of a bracket structure. 73 74_itemre = re.compile(r""" 75 [ \t]* 76 [^\s#\\] # if we match, m.end()-1 is the interesting char 77""", re.VERBOSE).match 78 79# Match start of statements that should be followed by a dedent. 80 81_closere = re.compile(r""" 82 \s* 83 (?: return 84 | break 85 | continue 86 | raise 87 | pass 88 ) 89 \b 90""", re.VERBOSE).match 91 92# Chew up non-special chars as quickly as possible. If match is 93# successful, m.end() less 1 is the index of the last boring char 94# matched. If match is unsuccessful, the string starts with an 95# interesting char. 96 97_chew_ordinaryre = re.compile(r""" 98 [^[\](){}#'"\\]+ 99""", re.VERBOSE).match 100 101 102class ParseMap(dict): 103 r"""Dict subclass that maps anything not in dict to 'x'. 104 105 This is designed to be used with str.translate in study1. 106 Anything not specifically mapped otherwise becomes 'x'. 107 Example: replace everything except whitespace with 'x'. 108 109 >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r') 110 >>> "a + b\tc\nd".translate(keepwhite) 111 'x x x\tx\nx' 112 """ 113 # Calling this triples access time; see bpo-32940 114 def __missing__(self, key): 115 return 120 # ord('x') 116 117 118# Map all ascii to 120 to avoid __missing__ call, then replace some. 119trans = ParseMap.fromkeys(range(128), 120) 120trans.update((ord(c), ord('(')) for c in "({[") # open brackets => '('; 121trans.update((ord(c), ord(')')) for c in ")}]") # close brackets => ')'. 122trans.update((ord(c), ord(c)) for c in "\"'\\\n#") # Keep these. 123 124 125class Parser: 126 127 def __init__(self, indentwidth, tabwidth): 128 self.indentwidth = indentwidth 129 self.tabwidth = tabwidth 130 131 def set_code(self, s): 132 assert len(s) == 0 or s[-1] == '\n' 133 self.code = s 134 self.study_level = 0 135 136 def find_good_parse_start(self, is_char_in_string): 137 """ 138 Return index of a good place to begin parsing, as close to the 139 end of the string as possible. This will be the start of some 140 popular stmt like "if" or "def". Return None if none found: 141 the caller should pass more prior context then, if possible, or 142 if not (the entire program text up until the point of interest 143 has already been tried) pass 0 to set_lo(). 144 145 This will be reliable iff given a reliable is_char_in_string() 146 function, meaning that when it says "no", it's absolutely 147 guaranteed that the char is not in a string. 148 """ 149 code, pos = self.code, None 150 151 # Peek back from the end for a good place to start, 152 # but don't try too often; pos will be left None, or 153 # bumped to a legitimate synch point. 154 limit = len(code) 155 for tries in range(5): 156 i = code.rfind(":\n", 0, limit) 157 if i < 0: 158 break 159 i = code.rfind('\n', 0, i) + 1 # start of colon line (-1+1=0) 160 m = _synchre(code, i, limit) 161 if m and not is_char_in_string(m.start()): 162 pos = m.start() 163 break 164 limit = i 165 if pos is None: 166 # Nothing looks like a block-opener, or stuff does 167 # but is_char_in_string keeps returning true; most likely 168 # we're in or near a giant string, the colorizer hasn't 169 # caught up enough to be helpful, or there simply *aren't* 170 # any interesting stmts. In any of these cases we're 171 # going to have to parse the whole thing to be sure, so 172 # give it one last try from the start, but stop wasting 173 # time here regardless of the outcome. 174 m = _synchre(code) 175 if m and not is_char_in_string(m.start()): 176 pos = m.start() 177 return pos 178 179 # Peeking back worked; look forward until _synchre no longer 180 # matches. 181 i = pos + 1 182 while 1: 183 m = _synchre(code, i) 184 if m: 185 s, i = m.span() 186 if not is_char_in_string(s): 187 pos = s 188 else: 189 break 190 return pos 191 192 def set_lo(self, lo): 193 """ Throw away the start of the string. 194 195 Intended to be called with the result of find_good_parse_start(). 196 """ 197 assert lo == 0 or self.code[lo-1] == '\n' 198 if lo > 0: 199 self.code = self.code[lo:] 200 201 def _study1(self): 202 """Find the line numbers of non-continuation lines. 203 204 As quickly as humanly possible <wink>, find the line numbers (0- 205 based) of the non-continuation lines. 206 Creates self.{goodlines, continuation}. 207 """ 208 if self.study_level >= 1: 209 return 210 self.study_level = 1 211 212 # Map all uninteresting characters to "x", all open brackets 213 # to "(", all close brackets to ")", then collapse runs of 214 # uninteresting characters. This can cut the number of chars 215 # by a factor of 10-40, and so greatly speed the following loop. 216 code = self.code 217 code = code.translate(trans) 218 code = code.replace('xxxxxxxx', 'x') 219 code = code.replace('xxxx', 'x') 220 code = code.replace('xx', 'x') 221 code = code.replace('xx', 'x') 222 code = code.replace('\nx', '\n') 223 # Replacing x\n with \n would be incorrect because 224 # x may be preceded by a backslash. 225 226 # March over the squashed version of the program, accumulating 227 # the line numbers of non-continued stmts, and determining 228 # whether & why the last stmt is a continuation. 229 continuation = C_NONE 230 level = lno = 0 # level is nesting level; lno is line number 231 self.goodlines = goodlines = [0] 232 push_good = goodlines.append 233 i, n = 0, len(code) 234 while i < n: 235 ch = code[i] 236 i = i+1 237 238 # cases are checked in decreasing order of frequency 239 if ch == 'x': 240 continue 241 242 if ch == '\n': 243 lno = lno + 1 244 if level == 0: 245 push_good(lno) 246 # else we're in an unclosed bracket structure 247 continue 248 249 if ch == '(': 250 level = level + 1 251 continue 252 253 if ch == ')': 254 if level: 255 level = level - 1 256 # else the program is invalid, but we can't complain 257 continue 258 259 if ch == '"' or ch == "'": 260 # consume the string 261 quote = ch 262 if code[i-1:i+2] == quote * 3: 263 quote = quote * 3 264 firstlno = lno 265 w = len(quote) - 1 266 i = i+w 267 while i < n: 268 ch = code[i] 269 i = i+1 270 271 if ch == 'x': 272 continue 273 274 if code[i-1:i+w] == quote: 275 i = i+w 276 break 277 278 if ch == '\n': 279 lno = lno + 1 280 if w == 0: 281 # unterminated single-quoted string 282 if level == 0: 283 push_good(lno) 284 break 285 continue 286 287 if ch == '\\': 288 assert i < n 289 if code[i] == '\n': 290 lno = lno + 1 291 i = i+1 292 continue 293 294 # else comment char or paren inside string 295 296 else: 297 # didn't break out of the loop, so we're still 298 # inside a string 299 if (lno - 1) == firstlno: 300 # before the previous \n in code, we were in the first 301 # line of the string 302 continuation = C_STRING_FIRST_LINE 303 else: 304 continuation = C_STRING_NEXT_LINES 305 continue # with outer loop 306 307 if ch == '#': 308 # consume the comment 309 i = code.find('\n', i) 310 assert i >= 0 311 continue 312 313 assert ch == '\\' 314 assert i < n 315 if code[i] == '\n': 316 lno = lno + 1 317 if i+1 == n: 318 continuation = C_BACKSLASH 319 i = i+1 320 321 # The last stmt may be continued for all 3 reasons. 322 # String continuation takes precedence over bracket 323 # continuation, which beats backslash continuation. 324 if (continuation != C_STRING_FIRST_LINE 325 and continuation != C_STRING_NEXT_LINES and level > 0): 326 continuation = C_BRACKET 327 self.continuation = continuation 328 329 # Push the final line number as a sentinel value, regardless of 330 # whether it's continued. 331 assert (continuation == C_NONE) == (goodlines[-1] == lno) 332 if goodlines[-1] != lno: 333 push_good(lno) 334 335 def get_continuation_type(self): 336 self._study1() 337 return self.continuation 338 339 def _study2(self): 340 """ 341 study1 was sufficient to determine the continuation status, 342 but doing more requires looking at every character. study2 343 does this for the last interesting statement in the block. 344 Creates: 345 self.stmt_start, stmt_end 346 slice indices of last interesting stmt 347 self.stmt_bracketing 348 the bracketing structure of the last interesting stmt; for 349 example, for the statement "say(boo) or die", 350 stmt_bracketing will be ((0, 0), (0, 1), (2, 0), (2, 1), 351 (4, 0)). Strings and comments are treated as brackets, for 352 the matter. 353 self.lastch 354 last interesting character before optional trailing comment 355 self.lastopenbracketpos 356 if continuation is C_BRACKET, index of last open bracket 357 """ 358 if self.study_level >= 2: 359 return 360 self._study1() 361 self.study_level = 2 362 363 # Set p and q to slice indices of last interesting stmt. 364 code, goodlines = self.code, self.goodlines 365 i = len(goodlines) - 1 # Index of newest line. 366 p = len(code) # End of goodlines[i] 367 while i: 368 assert p 369 # Make p be the index of the stmt at line number goodlines[i]. 370 # Move p back to the stmt at line number goodlines[i-1]. 371 q = p 372 for nothing in range(goodlines[i-1], goodlines[i]): 373 # tricky: sets p to 0 if no preceding newline 374 p = code.rfind('\n', 0, p-1) + 1 375 # The stmt code[p:q] isn't a continuation, but may be blank 376 # or a non-indenting comment line. 377 if _junkre(code, p): 378 i = i-1 379 else: 380 break 381 if i == 0: 382 # nothing but junk! 383 assert p == 0 384 q = p 385 self.stmt_start, self.stmt_end = p, q 386 387 # Analyze this stmt, to find the last open bracket (if any) 388 # and last interesting character (if any). 389 lastch = "" 390 stack = [] # stack of open bracket indices 391 push_stack = stack.append 392 bracketing = [(p, 0)] 393 while p < q: 394 # suck up all except ()[]{}'"#\\ 395 m = _chew_ordinaryre(code, p, q) 396 if m: 397 # we skipped at least one boring char 398 newp = m.end() 399 # back up over totally boring whitespace 400 i = newp - 1 # index of last boring char 401 while i >= p and code[i] in " \t\n": 402 i = i-1 403 if i >= p: 404 lastch = code[i] 405 p = newp 406 if p >= q: 407 break 408 409 ch = code[p] 410 411 if ch in "([{": 412 push_stack(p) 413 bracketing.append((p, len(stack))) 414 lastch = ch 415 p = p+1 416 continue 417 418 if ch in ")]}": 419 if stack: 420 del stack[-1] 421 lastch = ch 422 p = p+1 423 bracketing.append((p, len(stack))) 424 continue 425 426 if ch == '"' or ch == "'": 427 # consume string 428 # Note that study1 did this with a Python loop, but 429 # we use a regexp here; the reason is speed in both 430 # cases; the string may be huge, but study1 pre-squashed 431 # strings to a couple of characters per line. study1 432 # also needed to keep track of newlines, and we don't 433 # have to. 434 bracketing.append((p, len(stack)+1)) 435 lastch = ch 436 p = _match_stringre(code, p, q).end() 437 bracketing.append((p, len(stack))) 438 continue 439 440 if ch == '#': 441 # consume comment and trailing newline 442 bracketing.append((p, len(stack)+1)) 443 p = code.find('\n', p, q) + 1 444 assert p > 0 445 bracketing.append((p, len(stack))) 446 continue 447 448 assert ch == '\\' 449 p = p+1 # beyond backslash 450 assert p < q 451 if code[p] != '\n': 452 # the program is invalid, but can't complain 453 lastch = ch + code[p] 454 p = p+1 # beyond escaped char 455 456 # end while p < q: 457 458 self.lastch = lastch 459 self.lastopenbracketpos = stack[-1] if stack else None 460 self.stmt_bracketing = tuple(bracketing) 461 462 def compute_bracket_indent(self): 463 """Return number of spaces the next line should be indented. 464 465 Line continuation must be C_BRACKET. 466 """ 467 self._study2() 468 assert self.continuation == C_BRACKET 469 j = self.lastopenbracketpos 470 code = self.code 471 n = len(code) 472 origi = i = code.rfind('\n', 0, j) + 1 473 j = j+1 # one beyond open bracket 474 # find first list item; set i to start of its line 475 while j < n: 476 m = _itemre(code, j) 477 if m: 478 j = m.end() - 1 # index of first interesting char 479 extra = 0 480 break 481 else: 482 # this line is junk; advance to next line 483 i = j = code.find('\n', j) + 1 484 else: 485 # nothing interesting follows the bracket; 486 # reproduce the bracket line's indentation + a level 487 j = i = origi 488 while code[j] in " \t": 489 j = j+1 490 extra = self.indentwidth 491 return len(code[i:j].expandtabs(self.tabwidth)) + extra 492 493 def get_num_lines_in_stmt(self): 494 """Return number of physical lines in last stmt. 495 496 The statement doesn't have to be an interesting statement. This is 497 intended to be called when continuation is C_BACKSLASH. 498 """ 499 self._study1() 500 goodlines = self.goodlines 501 return goodlines[-1] - goodlines[-2] 502 503 def compute_backslash_indent(self): 504 """Return number of spaces the next line should be indented. 505 506 Line continuation must be C_BACKSLASH. Also assume that the new 507 line is the first one following the initial line of the stmt. 508 """ 509 self._study2() 510 assert self.continuation == C_BACKSLASH 511 code = self.code 512 i = self.stmt_start 513 while code[i] in " \t": 514 i = i+1 515 startpos = i 516 517 # See whether the initial line starts an assignment stmt; i.e., 518 # look for an = operator 519 endpos = code.find('\n', startpos) + 1 520 found = level = 0 521 while i < endpos: 522 ch = code[i] 523 if ch in "([{": 524 level = level + 1 525 i = i+1 526 elif ch in ")]}": 527 if level: 528 level = level - 1 529 i = i+1 530 elif ch == '"' or ch == "'": 531 i = _match_stringre(code, i, endpos).end() 532 elif ch == '#': 533 # This line is unreachable because the # makes a comment of 534 # everything after it. 535 break 536 elif level == 0 and ch == '=' and \ 537 (i == 0 or code[i-1] not in "=<>!") and \ 538 code[i+1] != '=': 539 found = 1 540 break 541 else: 542 i = i+1 543 544 if found: 545 # found a legit =, but it may be the last interesting 546 # thing on the line 547 i = i+1 # move beyond the = 548 found = re.match(r"\s*\\", code[i:endpos]) is None 549 550 if not found: 551 # oh well ... settle for moving beyond the first chunk 552 # of non-whitespace chars 553 i = startpos 554 while code[i] not in " \t\n": 555 i = i+1 556 557 return len(code[self.stmt_start:i].expandtabs(\ 558 self.tabwidth)) + 1 559 560 def get_base_indent_string(self): 561 """Return the leading whitespace on the initial line of the last 562 interesting stmt. 563 """ 564 self._study2() 565 i, n = self.stmt_start, self.stmt_end 566 j = i 567 code = self.code 568 while j < n and code[j] in " \t": 569 j = j + 1 570 return code[i:j] 571 572 def is_block_opener(self): 573 "Return True if the last interesting statement opens a block." 574 self._study2() 575 return self.lastch == ':' 576 577 def is_block_closer(self): 578 "Return True if the last interesting statement closes a block." 579 self._study2() 580 return _closere(self.code, self.stmt_start) is not None 581 582 def get_last_stmt_bracketing(self): 583 """Return bracketing structure of the last interesting statement. 584 585 The returned tuple is in the format defined in _study2(). 586 """ 587 self._study2() 588 return self.stmt_bracketing 589 590 591if __name__ == '__main__': 592 from unittest import main 593 main('idlelib.idle_test.test_pyparse', verbosity=2) 594