1"""Define partial Python code Parser used by editor and hyperparser. 2 3Instances of ParseMap are used with str.translate. 4 5The following bound search and match functions are defined: 6_synchre - start of popular statement; 7_junkre - whitespace or comment line; 8_match_stringre: string, possibly without closer; 9_itemre - line that may have bracket structure start; 10_closere - line that must be followed by dedent. 11_chew_ordinaryre - non-special characters. 12""" 13import re 14 15# Reason last statement is continued (or C_NONE if it's not). 16(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE, 17 C_STRING_NEXT_LINES, C_BRACKET) = range(5) 18 19# Find what looks like the start of a popular statement. 20 21_synchre = re.compile(r""" 22 ^ 23 [ \t]* 24 (?: while 25 | else 26 | def 27 | return 28 | assert 29 | break 30 | class 31 | continue 32 | elif 33 | try 34 | except 35 | raise 36 | import 37 | yield 38 ) 39 \b 40""", re.VERBOSE | re.MULTILINE).search 41 42# Match blank line or non-indenting comment line. 43 44_junkre = re.compile(r""" 45 [ \t]* 46 (?: \# \S .* )? 47 \n 48""", re.VERBOSE).match 49 50# Match any flavor of string; the terminating quote is optional 51# so that we're robust in the face of incomplete program text. 52 53_match_stringre = re.compile(r""" 54 \""" [^"\\]* (?: 55 (?: \\. | "(?!"") ) 56 [^"\\]* 57 )* 58 (?: \""" )? 59 60| " [^"\\\n]* (?: \\. [^"\\\n]* )* "? 61 62| ''' [^'\\]* (?: 63 (?: \\. | '(?!'') ) 64 [^'\\]* 65 )* 66 (?: ''' )? 67 68| ' [^'\\\n]* (?: \\. [^'\\\n]* )* '? 69""", re.VERBOSE | re.DOTALL).match 70 71# Match a line that starts with something interesting; 72# used to find the first item of a bracket structure. 73 74_itemre = re.compile(r""" 75 [ \t]* 76 [^\s#\\] # if we match, m.end()-1 is the interesting char 77""", re.VERBOSE).match 78 79# Match start of statements that should be followed by a dedent. 80 81_closere = re.compile(r""" 82 \s* 83 (?: return 84 | break 85 | continue 86 | raise 87 | pass 88 ) 89 \b 90""", re.VERBOSE).match 91 92# Chew up non-special chars as quickly as possible. If match is 93# successful, m.end() less 1 is the index of the last boring char 94# matched. If match is unsuccessful, the string starts with an 95# interesting char. 96 97_chew_ordinaryre = re.compile(r""" 98 [^[\](){}#'"\\]+ 99""", re.VERBOSE).match 100 101 102class ParseMap(dict): 103 r"""Dict subclass that maps anything not in dict to 'x'. 104 105 This is designed to be used with str.translate in study1. 106 Anything not specifically mapped otherwise becomes 'x'. 107 Example: replace everything except whitespace with 'x'. 108 109 >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r') 110 >>> "a + b\tc\nd".translate(keepwhite) 111 'x x x\tx\nx' 112 """ 113 # Calling this triples access time; see bpo-32940 114 def __missing__(self, key): 115 return 120 # ord('x') 116 117 118# Map all ascii to 120 to avoid __missing__ call, then replace some. 119trans = ParseMap.fromkeys(range(128), 120) 120trans.update((ord(c), ord('(')) for c in "({[") # open brackets => '('; 121trans.update((ord(c), ord(')')) for c in ")}]") # close brackets => ')'. 122trans.update((ord(c), ord(c)) for c in "\"'\\\n#") # Keep these. 123 124 125class Parser: 126 127 def __init__(self, indentwidth, tabwidth): 128 self.indentwidth = indentwidth 129 self.tabwidth = tabwidth 130 131 def set_code(self, s): 132 assert len(s) == 0 or s[-1] == '\n' 133 self.code = s 134 self.study_level = 0 135 136 def find_good_parse_start(self, is_char_in_string=None, 137 _synchre=_synchre): 138 """ 139 Return index of a good place to begin parsing, as close to the 140 end of the string as possible. This will be the start of some 141 popular stmt like "if" or "def". Return None if none found: 142 the caller should pass more prior context then, if possible, or 143 if not (the entire program text up until the point of interest 144 has already been tried) pass 0 to set_lo(). 145 146 This will be reliable iff given a reliable is_char_in_string() 147 function, meaning that when it says "no", it's absolutely 148 guaranteed that the char is not in a string. 149 """ 150 code, pos = self.code, None 151 152 if not is_char_in_string: 153 # no clue -- make the caller pass everything 154 return None 155 156 # Peek back from the end for a good place to start, 157 # but don't try too often; pos will be left None, or 158 # bumped to a legitimate synch point. 159 limit = len(code) 160 for tries in range(5): 161 i = code.rfind(":\n", 0, limit) 162 if i < 0: 163 break 164 i = code.rfind('\n', 0, i) + 1 # start of colon line (-1+1=0) 165 m = _synchre(code, i, limit) 166 if m and not is_char_in_string(m.start()): 167 pos = m.start() 168 break 169 limit = i 170 if pos is None: 171 # Nothing looks like a block-opener, or stuff does 172 # but is_char_in_string keeps returning true; most likely 173 # we're in or near a giant string, the colorizer hasn't 174 # caught up enough to be helpful, or there simply *aren't* 175 # any interesting stmts. In any of these cases we're 176 # going to have to parse the whole thing to be sure, so 177 # give it one last try from the start, but stop wasting 178 # time here regardless of the outcome. 179 m = _synchre(code) 180 if m and not is_char_in_string(m.start()): 181 pos = m.start() 182 return pos 183 184 # Peeking back worked; look forward until _synchre no longer 185 # matches. 186 i = pos + 1 187 while 1: 188 m = _synchre(code, i) 189 if m: 190 s, i = m.span() 191 if not is_char_in_string(s): 192 pos = s 193 else: 194 break 195 return pos 196 197 def set_lo(self, lo): 198 """ Throw away the start of the string. 199 200 Intended to be called with the result of find_good_parse_start(). 201 """ 202 assert lo == 0 or self.code[lo-1] == '\n' 203 if lo > 0: 204 self.code = self.code[lo:] 205 206 def _study1(self): 207 """Find the line numbers of non-continuation lines. 208 209 As quickly as humanly possible <wink>, find the line numbers (0- 210 based) of the non-continuation lines. 211 Creates self.{goodlines, continuation}. 212 """ 213 if self.study_level >= 1: 214 return 215 self.study_level = 1 216 217 # Map all uninteresting characters to "x", all open brackets 218 # to "(", all close brackets to ")", then collapse runs of 219 # uninteresting characters. This can cut the number of chars 220 # by a factor of 10-40, and so greatly speed the following loop. 221 code = self.code 222 code = code.translate(trans) 223 code = code.replace('xxxxxxxx', 'x') 224 code = code.replace('xxxx', 'x') 225 code = code.replace('xx', 'x') 226 code = code.replace('xx', 'x') 227 code = code.replace('\nx', '\n') 228 # Replacing x\n with \n would be incorrect because 229 # x may be preceded by a backslash. 230 231 # March over the squashed version of the program, accumulating 232 # the line numbers of non-continued stmts, and determining 233 # whether & why the last stmt is a continuation. 234 continuation = C_NONE 235 level = lno = 0 # level is nesting level; lno is line number 236 self.goodlines = goodlines = [0] 237 push_good = goodlines.append 238 i, n = 0, len(code) 239 while i < n: 240 ch = code[i] 241 i = i+1 242 243 # cases are checked in decreasing order of frequency 244 if ch == 'x': 245 continue 246 247 if ch == '\n': 248 lno = lno + 1 249 if level == 0: 250 push_good(lno) 251 # else we're in an unclosed bracket structure 252 continue 253 254 if ch == '(': 255 level = level + 1 256 continue 257 258 if ch == ')': 259 if level: 260 level = level - 1 261 # else the program is invalid, but we can't complain 262 continue 263 264 if ch == '"' or ch == "'": 265 # consume the string 266 quote = ch 267 if code[i-1:i+2] == quote * 3: 268 quote = quote * 3 269 firstlno = lno 270 w = len(quote) - 1 271 i = i+w 272 while i < n: 273 ch = code[i] 274 i = i+1 275 276 if ch == 'x': 277 continue 278 279 if code[i-1:i+w] == quote: 280 i = i+w 281 break 282 283 if ch == '\n': 284 lno = lno + 1 285 if w == 0: 286 # unterminated single-quoted string 287 if level == 0: 288 push_good(lno) 289 break 290 continue 291 292 if ch == '\\': 293 assert i < n 294 if code[i] == '\n': 295 lno = lno + 1 296 i = i+1 297 continue 298 299 # else comment char or paren inside string 300 301 else: 302 # didn't break out of the loop, so we're still 303 # inside a string 304 if (lno - 1) == firstlno: 305 # before the previous \n in code, we were in the first 306 # line of the string 307 continuation = C_STRING_FIRST_LINE 308 else: 309 continuation = C_STRING_NEXT_LINES 310 continue # with outer loop 311 312 if ch == '#': 313 # consume the comment 314 i = code.find('\n', i) 315 assert i >= 0 316 continue 317 318 assert ch == '\\' 319 assert i < n 320 if code[i] == '\n': 321 lno = lno + 1 322 if i+1 == n: 323 continuation = C_BACKSLASH 324 i = i+1 325 326 # The last stmt may be continued for all 3 reasons. 327 # String continuation takes precedence over bracket 328 # continuation, which beats backslash continuation. 329 if (continuation != C_STRING_FIRST_LINE 330 and continuation != C_STRING_NEXT_LINES and level > 0): 331 continuation = C_BRACKET 332 self.continuation = continuation 333 334 # Push the final line number as a sentinel value, regardless of 335 # whether it's continued. 336 assert (continuation == C_NONE) == (goodlines[-1] == lno) 337 if goodlines[-1] != lno: 338 push_good(lno) 339 340 def get_continuation_type(self): 341 self._study1() 342 return self.continuation 343 344 def _study2(self): 345 """ 346 study1 was sufficient to determine the continuation status, 347 but doing more requires looking at every character. study2 348 does this for the last interesting statement in the block. 349 Creates: 350 self.stmt_start, stmt_end 351 slice indices of last interesting stmt 352 self.stmt_bracketing 353 the bracketing structure of the last interesting stmt; for 354 example, for the statement "say(boo) or die", 355 stmt_bracketing will be ((0, 0), (0, 1), (2, 0), (2, 1), 356 (4, 0)). Strings and comments are treated as brackets, for 357 the matter. 358 self.lastch 359 last interesting character before optional trailing comment 360 self.lastopenbracketpos 361 if continuation is C_BRACKET, index of last open bracket 362 """ 363 if self.study_level >= 2: 364 return 365 self._study1() 366 self.study_level = 2 367 368 # Set p and q to slice indices of last interesting stmt. 369 code, goodlines = self.code, self.goodlines 370 i = len(goodlines) - 1 # Index of newest line. 371 p = len(code) # End of goodlines[i] 372 while i: 373 assert p 374 # Make p be the index of the stmt at line number goodlines[i]. 375 # Move p back to the stmt at line number goodlines[i-1]. 376 q = p 377 for nothing in range(goodlines[i-1], goodlines[i]): 378 # tricky: sets p to 0 if no preceding newline 379 p = code.rfind('\n', 0, p-1) + 1 380 # The stmt code[p:q] isn't a continuation, but may be blank 381 # or a non-indenting comment line. 382 if _junkre(code, p): 383 i = i-1 384 else: 385 break 386 if i == 0: 387 # nothing but junk! 388 assert p == 0 389 q = p 390 self.stmt_start, self.stmt_end = p, q 391 392 # Analyze this stmt, to find the last open bracket (if any) 393 # and last interesting character (if any). 394 lastch = "" 395 stack = [] # stack of open bracket indices 396 push_stack = stack.append 397 bracketing = [(p, 0)] 398 while p < q: 399 # suck up all except ()[]{}'"#\\ 400 m = _chew_ordinaryre(code, p, q) 401 if m: 402 # we skipped at least one boring char 403 newp = m.end() 404 # back up over totally boring whitespace 405 i = newp - 1 # index of last boring char 406 while i >= p and code[i] in " \t\n": 407 i = i-1 408 if i >= p: 409 lastch = code[i] 410 p = newp 411 if p >= q: 412 break 413 414 ch = code[p] 415 416 if ch in "([{": 417 push_stack(p) 418 bracketing.append((p, len(stack))) 419 lastch = ch 420 p = p+1 421 continue 422 423 if ch in ")]}": 424 if stack: 425 del stack[-1] 426 lastch = ch 427 p = p+1 428 bracketing.append((p, len(stack))) 429 continue 430 431 if ch == '"' or ch == "'": 432 # consume string 433 # Note that study1 did this with a Python loop, but 434 # we use a regexp here; the reason is speed in both 435 # cases; the string may be huge, but study1 pre-squashed 436 # strings to a couple of characters per line. study1 437 # also needed to keep track of newlines, and we don't 438 # have to. 439 bracketing.append((p, len(stack)+1)) 440 lastch = ch 441 p = _match_stringre(code, p, q).end() 442 bracketing.append((p, len(stack))) 443 continue 444 445 if ch == '#': 446 # consume comment and trailing newline 447 bracketing.append((p, len(stack)+1)) 448 p = code.find('\n', p, q) + 1 449 assert p > 0 450 bracketing.append((p, len(stack))) 451 continue 452 453 assert ch == '\\' 454 p = p+1 # beyond backslash 455 assert p < q 456 if code[p] != '\n': 457 # the program is invalid, but can't complain 458 lastch = ch + code[p] 459 p = p+1 # beyond escaped char 460 461 # end while p < q: 462 463 self.lastch = lastch 464 self.lastopenbracketpos = stack[-1] if stack else None 465 self.stmt_bracketing = tuple(bracketing) 466 467 def compute_bracket_indent(self): 468 """Return number of spaces the next line should be indented. 469 470 Line continuation must be C_BRACKET. 471 """ 472 self._study2() 473 assert self.continuation == C_BRACKET 474 j = self.lastopenbracketpos 475 code = self.code 476 n = len(code) 477 origi = i = code.rfind('\n', 0, j) + 1 478 j = j+1 # one beyond open bracket 479 # find first list item; set i to start of its line 480 while j < n: 481 m = _itemre(code, j) 482 if m: 483 j = m.end() - 1 # index of first interesting char 484 extra = 0 485 break 486 else: 487 # this line is junk; advance to next line 488 i = j = code.find('\n', j) + 1 489 else: 490 # nothing interesting follows the bracket; 491 # reproduce the bracket line's indentation + a level 492 j = i = origi 493 while code[j] in " \t": 494 j = j+1 495 extra = self.indentwidth 496 return len(code[i:j].expandtabs(self.tabwidth)) + extra 497 498 def get_num_lines_in_stmt(self): 499 """Return number of physical lines in last stmt. 500 501 The statement doesn't have to be an interesting statement. This is 502 intended to be called when continuation is C_BACKSLASH. 503 """ 504 self._study1() 505 goodlines = self.goodlines 506 return goodlines[-1] - goodlines[-2] 507 508 def compute_backslash_indent(self): 509 """Return number of spaces the next line should be indented. 510 511 Line continuation must be C_BACKSLASH. Also assume that the new 512 line is the first one following the initial line of the stmt. 513 """ 514 self._study2() 515 assert self.continuation == C_BACKSLASH 516 code = self.code 517 i = self.stmt_start 518 while code[i] in " \t": 519 i = i+1 520 startpos = i 521 522 # See whether the initial line starts an assignment stmt; i.e., 523 # look for an = operator 524 endpos = code.find('\n', startpos) + 1 525 found = level = 0 526 while i < endpos: 527 ch = code[i] 528 if ch in "([{": 529 level = level + 1 530 i = i+1 531 elif ch in ")]}": 532 if level: 533 level = level - 1 534 i = i+1 535 elif ch == '"' or ch == "'": 536 i = _match_stringre(code, i, endpos).end() 537 elif ch == '#': 538 # This line is unreachable because the # makes a comment of 539 # everything after it. 540 break 541 elif level == 0 and ch == '=' and \ 542 (i == 0 or code[i-1] not in "=<>!") and \ 543 code[i+1] != '=': 544 found = 1 545 break 546 else: 547 i = i+1 548 549 if found: 550 # found a legit =, but it may be the last interesting 551 # thing on the line 552 i = i+1 # move beyond the = 553 found = re.match(r"\s*\\", code[i:endpos]) is None 554 555 if not found: 556 # oh well ... settle for moving beyond the first chunk 557 # of non-whitespace chars 558 i = startpos 559 while code[i] not in " \t\n": 560 i = i+1 561 562 return len(code[self.stmt_start:i].expandtabs(\ 563 self.tabwidth)) + 1 564 565 def get_base_indent_string(self): 566 """Return the leading whitespace on the initial line of the last 567 interesting stmt. 568 """ 569 self._study2() 570 i, n = self.stmt_start, self.stmt_end 571 j = i 572 code = self.code 573 while j < n and code[j] in " \t": 574 j = j + 1 575 return code[i:j] 576 577 def is_block_opener(self): 578 "Return True if the last interesting statemtent opens a block." 579 self._study2() 580 return self.lastch == ':' 581 582 def is_block_closer(self): 583 "Return True if the last interesting statement closes a block." 584 self._study2() 585 return _closere(self.code, self.stmt_start) is not None 586 587 def get_last_stmt_bracketing(self): 588 """Return bracketing structure of the last interesting statement. 589 590 The returned tuple is in the format defined in _study2(). 591 """ 592 self._study2() 593 return self.stmt_bracketing 594 595 596if __name__ == '__main__': 597 from unittest import main 598 main('idlelib.idle_test.test_pyparse', verbosity=2) 599