#!/usr/bin/env python
#
# Copyright 2007 Neal Norwitz
# Portions Copyright 2007 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenize C++ source code."""

try:
    # Python 3.x
    import builtins
except ImportError:
    # Python 2.x
    import __builtin__ as builtins


import sys

from cpp import utils


if not hasattr(builtins, 'set'):
    # Nominal support for Python 2.3.
    from sets import Set as set


# Add $ as a valid identifier char since so much code uses it.
_letters = 'abcdefghijklmnopqrstuvwxyz'
VALID_IDENTIFIER_CHARS = set(_letters + _letters.upper() + '_0123456789$')
HEX_DIGITS = set('0123456789abcdefABCDEF')
INT_OR_FLOAT_DIGITS = set('0123456789eE-+')


# C++0x string prefixes.
_STR_PREFIXES = set(('R', 'u8', 'u8R', 'u', 'uR', 'U', 'UR', 'L', 'LR'))


# Token types.
UNKNOWN = 'UNKNOWN'
SYNTAX = 'SYNTAX'
CONSTANT = 'CONSTANT'
NAME = 'NAME'
PREPROCESSOR = 'PREPROCESSOR'

# Where the token originated from.  This can be used for backtracking.
# It is always set to WHENCE_STREAM in this code.
WHENCE_STREAM, WHENCE_QUEUE = range(2)


class Token(object):
    """Data container to represent a C++ token.

    Tokens can be identifiers, syntax char(s), constants, or
    pre-processor directives.

    start contains the index of the first char of the token in the source.
    end contains the index of the last char of the token in the source.
    """

    def __init__(self, token_type, name, start, end):
        self.token_type = token_type
        self.name = name
        self.start = start
        self.end = end
        self.whence = WHENCE_STREAM

    def __str__(self):
        if not utils.DEBUG:
            return 'Token(%r)' % self.name
        return 'Token(%r, %s, %s)' % (self.name, self.start, self.end)

    __repr__ = __str__


def _GetString(source, start, i):
    """Returns the index just past the closing " of a string literal."""
    i = source.find('"', i+1)
    while source[i-1] == '\\':
        # Count the trailing backslashes.
        backslash_count = 1
        j = i - 2
        while source[j] == '\\':
            backslash_count += 1
            j -= 1
        # When the number of trailing backslashes is even, they escape
        # each other, so the quote is not escaped.
        if (backslash_count % 2) == 0:
            break
        i = source.find('"', i+1)
    return i + 1


def _GetChar(source, start, i):
    """Returns the index just past the closing ' of a char literal."""
    # NOTE(nnorwitz): may not be quite correct, should be good enough.
    i = source.find("'", i+1)
    while source[i-1] == '\\':
        # Need to special case '\\'.
        if (i - 2) > start and source[i-2] == '\\':
            break
        i = source.find("'", i+1)
    # Try to handle unterminated single quotes (in a #if 0 block).
    if i < 0:
        i = start
    return i + 1


def GetTokens(source):
    """Returns a sequence of Tokens.

    Args:
      source: string of C++ source code.

    Yields:
      Token that represents the next token in the source.
    """
    # Cache various valid character sets for speed.
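    # (In CPython, reading a local name is cheaper than a module-global
    # lookup, and these sets are consulted once per character below.)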
    valid_identifier_chars = VALID_IDENTIFIER_CHARS
    hex_digits = HEX_DIGITS
    int_or_float_digits = INT_OR_FLOAT_DIGITS
    int_or_float_digits2 = int_or_float_digits | set('.')

    # Only ignore errors while in a #if 0 block.
    ignore_errors = False
    count_ifs = 0

    i = 0
    end = len(source)
    while i < end:
        # Skip whitespace.
        while i < end and source[i].isspace():
            i += 1
        if i >= end:
            return

        token_type = UNKNOWN
        start = i
        c = source[i]
        if c.isalpha() or c == '_':              # Find an identifier.
            token_type = NAME
            while source[i] in valid_identifier_chars:
                i += 1
            # String and character constants can look like a name if
            # they are something like L"".
            if (source[i] == "'" and (i - start) == 1 and
                source[start:i] in 'uUL'):
                # u, U, and L are valid C++0x character prefixes.
                token_type = CONSTANT
                i = _GetChar(source, start, i)
            elif source[i] == '"' and source[start:i] in _STR_PREFIXES:
                token_type = CONSTANT
                i = _GetString(source, start, i)
        elif c == '/' and source[i+1] == '/':    # Find // comments.
            i = source.find('\n', i)
            if i == -1:  # Handle EOF.
                i = end
            continue
        elif c == '/' and source[i+1] == '*':    # Find /* comments. */
            i = source.find('*/', i) + 2
            continue
        elif c in ':+-<>&|*=':                   # : or :: (plus other chars).
            token_type = SYNTAX
            i += 1
            new_ch = source[i]
            if new_ch == c and c != '>':         # Treat ">>" as two tokens.
                i += 1
            elif c == '-' and new_ch == '>':
                i += 1
            elif new_ch == '=':
                i += 1
        elif c in '()[]{}~!?^%;/.,':             # Handle single char tokens.
            token_type = SYNTAX
            i += 1
            if c == '.' and source[i].isdigit():
                token_type = CONSTANT
                i += 1
                while source[i] in int_or_float_digits:
                    i += 1
                # Handle float suffixes.
                for suffix in ('l', 'f'):
                    if suffix == source[i:i+1].lower():
                        i += 1
                        break
        elif c.isdigit():                        # Find integer.
            token_type = CONSTANT
            if c == '0' and source[i+1] in 'xX':
                # Handle hex digits.
                i += 2
                while source[i] in hex_digits:
                    i += 1
            else:
                while source[i] in int_or_float_digits2:
                    i += 1
            # Handle integer (and float) suffixes.
            for suffix in ('ull', 'll', 'ul', 'l', 'f', 'u'):
                size = len(suffix)
                if suffix == source[i:i+size].lower():
                    i += size
                    break
        elif c == '"':                           # Find string.
            token_type = CONSTANT
            i = _GetString(source, start, i)
        elif c == "'":                           # Find char.
            token_type = CONSTANT
            i = _GetChar(source, start, i)
        elif c == '#':                           # Find pre-processor command.
            token_type = PREPROCESSOR
            got_if = source[i:i+3] == '#if' and source[i+3:i+4].isspace()
            if got_if:
                count_ifs += 1
            elif source[i:i+6] == '#endif':
                count_ifs -= 1
                if count_ifs == 0:
                    ignore_errors = False

            # TODO(nnorwitz): handle preprocessor statements (\ continuations).
            while 1:
                i1 = source.find('\n', i)
                i2 = source.find('//', i)
                i3 = source.find('/*', i)
                i4 = source.find('"', i)
                # NOTE(nnorwitz): doesn't handle comments in #define macros.
                # Get the first important symbol (newline, comment, EOF/end).
                i = min([x for x in (i1, i2, i3, i4, end) if x != -1])

                # Handle #include "dir//foo.h" properly.
                if source[i] == '"':
                    i = source.find('"', i+1) + 1
                    assert i > 0
                    continue
                # Keep going if this is the end of the line and the line
                # ends with \ (a preprocessor line continuation).
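                # (i == i1 means the nearest stop character found above was
                # the newline itself; the backslash test below looks at the
                # character immediately before that newline.)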
                if not (i == i1 and source[i-1] == '\\'):
                    if got_if:
                        condition = source[start+4:i].lstrip()
                        if (condition.startswith('0') or
                            condition.startswith('(0)')):
                            ignore_errors = True
                    break
                i += 1
        elif c == '\\':                          # Handle \ in code.
            # This is different from the pre-processor \ handling.
            i += 1
            continue
        elif ignore_errors:
            # The tokenizer seems to be in pretty good shape.  This
            # raise is conditionally disabled so that bogus code
            # in an #if 0 block can be handled.  Since we will ignore
            # it anyway, this is probably fine.  So disable the
            # exception and return the bogus char.
            i += 1
        else:
            sys.stderr.write('Got invalid token in %s @ %d token:%s: %r\n' %
                             ('?', i, c, source[i-10:i+10]))
            raise RuntimeError('unexpected token')

        if i <= 0:
            print('Invalid index, exiting now.')
            return
        yield Token(token_type, source[start:i], start, i)


if __name__ == '__main__':
    def main(argv):
        """Driver mostly for testing purposes."""
        for filename in argv[1:]:
            source = utils.ReadFile(filename)
            if source is None:
                continue

            for token in GetTokens(source):
                print('%-12s: %s' % (token.token_type, token.name))
                # print('\r%6.2f%%' % (100.0 * index / token.end),)
            sys.stdout.write('\n')


    main(sys.argv)
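# A minimal usage sketch (assuming this module is importable as
# cpp.tokenize, to match the `from cpp import utils` import above):
#
#     from cpp import tokenize
#     for token in tokenize.GetTokens('int x = 42; // answer\n'):
#         print(token.token_type, token.name)
#
# This should print NAME/int, NAME/x, SYNTAX/=, CONSTANT/42, and SYNTAX/;
# -- the trailing // comment is skipped rather than yielded as a token.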