"""Provide advanced parsing abilities for ParenMatch and other extensions.

HyperParser uses PyParser. PyParser mostly gives information on the
proper indentation of code. HyperParser gives additional information on
the structure of code.
"""
from keyword import iskeyword
import string

from idlelib import pyparse

# all ASCII chars that may be in an identifier
_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
# all ASCII chars that may be the first char of an identifier
_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")

# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
# lookup table for whether 7-bit ASCII chars are valid as the first
# char in a Python identifier
_IS_ASCII_ID_FIRST_CHAR = \
    [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]


class HyperParser:
    """Answer structural questions about the code around a text index.

    Instance attributes set by __init__ and set_index:

    rawtext        -- the analyzed source text (the parser's code minus
                      the trailing space and newline added for parsing)
    stopatindex    -- text-widget index corresponding to the end of rawtext
    bracketing     -- sequence of pairs from the parser; item [0] is used
                      as an offset into rawtext, item [1] as a nesting level
    isopener       -- parallel list: True where the level rises relative
                      to the previous bracketing entry
    indexinrawtext -- offset in rawtext of the current index
    indexbracket   -- position in bracketing to which the index belongs
    """

    def __init__(self, editwin, index):
        """To initialize, analyze the surroundings of the given index.

        editwin must provide text, indentwidth, tabwidth,
        prompt_last_line, num_context_lines and
        _build_char_in_string_func(); index is a text-widget index.
        """

        self.editwin = editwin
        self.text = text = editwin.text

        parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)

        def index2line(index):
            # A text index looks like "line.column"; keep the line part.
            return int(float(index))
        lno = index2line(text.index(index))

        # The analyzed text always ends at the end of the index's line.
        stopatindex = "%d.end" % lno

        if not editwin.prompt_last_line:
            # Try increasingly large amounts of context until the parser
            # finds a good place to start, or we reach the first line.
            for context in editwin.num_context_lines:
                startat = max(lno - context, 1)
                startatindex = repr(startat) + ".0"
                # We add the newline because PyParse requires a newline
                # at end. We add a space so that index won't be at end
                # of line, so that its status will be the same as the
                # char before it, if should.
                parser.set_code(text.get(startatindex, stopatindex)+' \n')
                bod = parser.find_good_parse_start(
                          editwin._build_char_in_string_func(startatindex))
                if bod is not None or startat == 1:
                    break
            parser.set_lo(bod or 0)
        else:
            # Start right after the most recent "console" tagged range
            # before index, or at the top of the text if there is none.
            r = text.tag_prevrange("console", index)
            if r:
                startatindex = r[1]
            else:
                startatindex = "1.0"
            # We add the newline because PyParse requires it. We add a
            # space so that index won't be at end of line, so that its
            # status will be the same as the char before it, if should.
            parser.set_code(text.get(startatindex, stopatindex)+' \n')
            parser.set_lo(0)

        # We want what the parser has, minus the last newline and space.
        self.rawtext = parser.code[:-2]
        # Parser.code apparently preserves the statement we are in, so
        # that stopatindex can be used to synchronize the string with
        # the text box indices.
        self.stopatindex = stopatindex
        self.bracketing = bracketing = parser.get_last_stmt_bracketing()
        # find which pairs of bracketing are openers. These always
        # correspond to a character of rawtext.
        self.isopener = [i > 0 and level > bracketing[i-1][1]
                         for i, (_, level) in enumerate(bracketing)]

        self.set_index(index)

    def set_index(self, index):
        """Set the index to which the functions relate.

        The index must be in the same statement.

        Raise ValueError if index lies before the start of the
        analyzed text.
        """
        # rawtext ends at stopatindex, so the offset of index is found
        # by measuring the text between index and stopatindex.
        indexinrawtext = (len(self.rawtext) -
                          len(self.text.get(index, self.stopatindex)))
        if indexinrawtext < 0:
            raise ValueError("Index %s precedes the analyzed statement"
                             % index)
        self.indexinrawtext = indexinrawtext
        # find the rightmost bracket to which index belongs
        self.indexbracket = 0
        while (self.indexbracket < len(self.bracketing)-1 and
               self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
            self.indexbracket += 1
        # An index exactly at the next entry belongs to that entry only
        # when the entry is not an opener.
        if (self.indexbracket < len(self.bracketing)-1 and
            self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and
            not self.isopener[self.indexbracket+1]):
            self.indexbracket += 1

    def is_in_string(self):
        """Is the index given to the HyperParser in a string?"""
        # The bracket to which we belong should be an opener.
        # If it's an opener, it has to have a character.
        return (self.isopener[self.indexbracket] and
                self.rawtext[self.bracketing[self.indexbracket][0]]
                in ('"', "'"))

    def is_in_code(self):
        """Is the index given to the HyperParser in normal code?"""
        return (not self.isopener[self.indexbracket] or
                self.rawtext[self.bracketing[self.indexbracket][0]]
                not in ('#', '"', "'"))

    def get_surrounding_brackets(self, openers='([{', mustclose=False):
        """Return bracket indexes or None.

        If the index given to the HyperParser is surrounded by a
        bracket defined in openers (or at least has one before it),
        return the indices of the opening bracket and the closing
        bracket (or the end of line, whichever comes first).

        If it is not surrounded by brackets, or the end of line comes
        before the closing bracket and mustclose is True, returns None.
        """

        bracketinglevel = self.bracketing[self.indexbracket][1]
        before = self.indexbracket
        # Walk backwards to the nearest enclosing opener that is one of
        # the requested characters, tracking the lowest level seen.
        while (not self.isopener[before] or
               self.rawtext[self.bracketing[before][0]] not in openers or
               self.bracketing[before][1] > bracketinglevel):
            before -= 1
            if before < 0:
                return None
            bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
        # Walk forwards to the first entry below that level.
        after = self.indexbracket + 1
        while (after < len(self.bracketing) and
               self.bracketing[after][1] >= bracketinglevel):
            after += 1

        # Convert rawtext offsets to text indices, counting characters
        # back from stopatindex.
        beforeindex = self.text.index("%s-%dc" %
            (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
        if (after >= len(self.bracketing) or
                self.bracketing[after][0] > len(self.rawtext)):
            if mustclose:
                return None
            afterindex = self.stopatindex
        else:
            # We are after a real char, so it is a ')' and we give the
            # index before it.
            afterindex = self.text.index(
                "%s-%dc" % (self.stopatindex,
                 len(self.rawtext)-(self.bracketing[after][0]-1)))

        return beforeindex, afterindex

    # the set of built-in identifiers which are also keywords,
    # i.e. keyword.iskeyword() returns True for them
    _ID_KEYWORDS = frozenset({"True", "False", "None"})

    @classmethod
    def _eat_identifier(cls, str, limit, pos):
        """Given a string and pos, return the number of chars in the
        identifier which ends at pos, or 0 if there is no such one.

        Characters before limit are never examined.  Keywords are not
        considered identifiers, except for True, False and None.
        """
        is_ascii_id_char = _IS_ASCII_ID_CHAR

        # Start at the end (pos) and work backwards.
        i = pos

        # Go backwards as long as the characters are valid ASCII
        # identifier characters. This is an optimization, since it
        # is faster in the common case where most of the characters
        # are ASCII.
        while i > limit and (
                ord(str[i - 1]) < 128 and
                is_ascii_id_char[ord(str[i - 1])]
        ):
            i -= 1

        # If the above loop ended due to reaching a non-ASCII
        # character, continue going backwards using the most generic
        # test for whether a string contains only valid identifier
        # characters.
        if i > limit and ord(str[i - 1]) >= 128:
            # The 'a' prefix makes the test accept characters that are
            # valid inside an identifier but not as its first character.
            while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
                i -= 4
            if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
                i -= 2
            if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
                i -= 1

            # The identifier candidate starts here. If it isn't a valid
            # identifier, don't eat anything. At this point that is only
            # possible if the first character isn't a valid first
            # character for an identifier.
            if not str[i:pos].isidentifier():
                return 0
        elif i < pos:
            # All characters in str[i:pos] are valid ASCII identifier
            # characters, so it is enough to check that the first is
            # valid as the first character of an identifier.
            if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
                return 0

        # All keywords are valid identifiers, but should not be
        # considered identifiers here, except for True, False and None.
        if i < pos and (
                iskeyword(str[i:pos]) and
                str[i:pos] not in cls._ID_KEYWORDS
        ):
            return 0

        return pos - i

    # This string includes all chars that may be in a white space
    _whitespace_chars = " \t\n\\"

    # Characters that may form the prefix of a string literal.  'f' and
    # 'F' are included so that f-strings keep their prefix.
    _string_prefix_chars = "rRbBuUfF"

    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        Raise ValueError if the index is not in normal code (see
        is_in_code()).
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing
        prefix_chars = self._string_prefix_chars

        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        last_identifier_pos = pos
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if (pos > brck_limit and
                        rawtext[pos-1] in self._whitespace_chars):
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression end at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                # [] and () may be used after an identifier, so in that
                # case we continue. postdot_phase is True, so we don't
                # allow a dot.
                if rawtext[pos] not in "([":
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix
                        while pos > 0 and rawtext[pos - 1] in prefix_chars:
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]


if __name__ == '__main__':
    from unittest import main
    main('idlelib.idle_test.test_hyperparser', verbosity=2)