# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.
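
    For example, tokenizing the two source lines ``if x:`` and ``    a = 1``
    should produce something like::

        [('key', 'if'), ('ws', ' '), ('nam', 'x'), ('op', ':')]
        [('ws', '    '), ('nam', 'a'), ('ws', ' '), ('op', '='), ('ws', ' '), ('num', '1')]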

    """
    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    # The \f is because of http://bugs.python.org/issue19035
    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
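    # The order below mirrors tokenize's detect_encoding(): check for a UTF-8
    # BOM first, then for a coding cookie on the first line, then on the
    # second line; otherwise fall back to the default.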
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf8, and then tries to interpret those utf8
    bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    source = COOKIE_RE.sub("# (deleted declaration)", source, count=1)
    return source
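

# A minimal usage sketch (illustrative only: the sample source below is made
# up, and the import assumes this module is importable as coverage.phystokens):
#
#     from coverage.phystokens import source_encoding, neuter_encoding_declaration
#
#     byte_source = b"# coding: latin-1\nwidth = 3\n"
#     source_encoding(byte_source)          # -> 'iso-8859-1'
#
#     text = byte_source.decode('iso-8859-1')
#     neuter_encoding_declaration(text)     # -> "# (deleted declaration)\nwidth = 3\n"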