1#!/usr/bin/env python 2 3"""A tool for extracting a list of symbols to export 4 5When exporting symbols from a dll or exe we either need to mark the symbols in 6the source code as __declspec(dllexport) or supply a list of symbols to the 7linker. This program automates the latter by inspecting the symbol tables of a 8list of link inputs and deciding which of those symbols need to be exported. 9 10We can't just export all the defined symbols, as there's a limit of 65535 11exported symbols and in clang we go way over that, particularly in a debug 12build. Therefore a large part of the work is pruning symbols either which can't 13be imported, or which we think are things that have definitions in public header 14files (i.e. template instantiations) and we would get defined in the thing 15importing these symbols anyway. 16""" 17 18from __future__ import print_function 19import sys 20import re 21import os 22import subprocess 23import multiprocessing 24import argparse 25 26# Define functions which extract a list of symbols from a library using several 27# different tools. We use subprocess.Popen and yield a symbol at a time instead 28# of using subprocess.check_output and returning a list as, especially on 29# Windows, waiting for the entire output to be ready can take a significant 30# amount of time. 

def _iter_tool_output(command):
    """Run *command* and yield its standard output one line at a time.

    The child's stdin is closed straight away so that it can never block
    waiting for input. Output is streamed rather than collected because,
    especially on Windows, waiting for the whole output can take a
    significant amount of time.
    """
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    try:
        for line in process.stdout:
            yield line
    finally:
        # Runs both when the output is exhausted and when the consumer stops
        # iterating early, so the pipe is always closed and the child reaped.
        process.stdout.close()
        process.wait()

def dumpbin_get_symbols(lib):
    """Yield each defined external symbol reported by 'dumpbin /symbols lib'."""
    # Look for external symbols that are defined in some section
    pattern = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
    for line in _iter_tool_output(['dumpbin', '/symbols', lib]):
        match = pattern.match(line)
        if match:
            yield match.group(1)

def nm_get_symbols(lib):
    """Yield each defined external symbol reported by 'nm lib'."""
    # Look for external symbols that are defined in some section
    pattern = re.compile(r"^\S+\s+[BDGRSTVW]\s+(\S+)$")
    for line in _iter_tool_output(['nm', lib]):
        match = pattern.match(line)
        if match:
            yield match.group(1)

def readobj_get_symbols(lib):
    """Yield each defined external symbol reported by 'llvm-readobj -symbols'.

    A symbol is yielded when its StorageClass is 'External' and the most
    recently seen Section is neither IMAGE_SYM_ABSOLUTE nor
    IMAGE_SYM_UNDEFINED.
    """
    name_re = re.compile(r'Name: (\S+)')
    section_re = re.compile(r'Section: (\S+)')
    storageclass_re = re.compile(r'StorageClass: (\S+)')
    undefined_sections = (None, 'IMAGE_SYM_ABSOLUTE', 'IMAGE_SYM_UNDEFINED')
    # Start with "nothing seen yet" so that a StorageClass line appearing
    # before any Name/Section line is skipped instead of raising
    # UnboundLocalError.
    name = None
    section = None
    for line in _iter_tool_output(['llvm-readobj', '-symbols', lib]):
        # When looking through the output of llvm-readobj we expect to see
        # Name, Section, then StorageClass, so record Name and Section when we
        # see them and decide if this is a defined external symbol when we see
        # StorageClass.
        match = name_re.search(line)
        if match:
            name = match.group(1)
        match = section_re.search(line)
        if match:
            section = match.group(1)
        match = storageclass_re.search(line)
        if match:
            storageclass = match.group(1)
            if (name is not None and
                    section not in undefined_sections and
                    storageclass == 'External'):
                yield name

# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).

def dumpbin_is_32bit_windows(lib):
    """Return True if 'dumpbin /headers lib' reports the machine as x86.

    Returns False if no 'machine (...)' line is found.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    for line in process.stdout:
        match = re.match(r'.+machine \((\S+)\)', line)
        if match:
            retval = (match.group(1) == 'x86')
            break
    process.stdout.close()
    process.wait()
    return retval

def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f lib' reports the file format as pe-i386.

    Returns False if no 'file format' line is found.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns one string; it must be split into lines, as
    # iterating over the string itself yields single characters which can
    # never match the pattern.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return match.group(1) == 'pe-i386'
    return False

def readobj_is_32bit_windows(lib):
    """Return True if 'llvm-readobj -file-headers lib' reports COFF-i386.

    Returns False if no 'Format:' line is found.
    """
    output = subprocess.check_output(['llvm-readobj', '-file-headers', lib],
                                     universal_newlines=True)
    # Split into lines: iterating over the string itself yields characters.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return match.group(1) == 'COFF-i386'
    return False

# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    Returns the name to export (which for unmangled names may have its calling
    convention decoration stripped), or None if the symbol is to be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$ (after the ? that begins
    # every mangled name), discard them as it's assumed that the definition is
    # public
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None

# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    Returns the name to export (with any calling convention decoration
    removed), or None if the symbol is to be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith(('_', '.')):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check stops a
    # single-component name from indexing past the start of the list.
    if (len(names) > 1 and re.match(r'[CD][123]', names[-1][0])
            and names[-2][1]):
        return None
    # Discard function template instantiations as it's assumed that the
    # definition is public
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] in ('4llvm', '5clang'):
        return symbol
    # Discard everything else
    return None

# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised when a mangled name is too complex to be worth demangling."""
    pass

# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Split one mangled name component off the front of *arg*.

    Returns (name, rest). A length-prefixed name keeps its length prefix in
    the returned name. Returns (None, arg) if nothing could be parsed.
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1) + match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg

# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the template argument list at the start of *arg*.

    *arg* must start with 'I'. Returns the text following the 'E' that closes
    the argument list, or None if the string runs out before that 'E' is
    found. Raises TooComplexName if an argument starts with 'L' or 'X', which
    is taken to be the start of an expression.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    name_re = re.compile(r'(\d+)(.+)')
    substitution_re = re.compile(r'S[A-Z0-9]*_(.+)')
    tmp = arg[1:]
    # tmp becomes None when a nested parse fails, which also ends the loop
    while tmp:
        # Check for names
        match = name_re.match(tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = substitution_re.match(tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith(('L', 'X')):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None

# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the nested name at the start of *arg*, which must start with 'N'.

    Returns (names, rest) where names is a list of (name, is_template) pairs
    and rest is the text after the closing 'E'. Returns (None, None) if the
    name could not be parsed.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    if match:
        tmp = match.group(1)
    else:
        tmp = arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None

def extract_symbols(arg):
    """Count the symbols to keep from a single library.

    *arg* is a (get_symbols, should_keep_symbol,
    calling_convention_decoration, lib) tuple; everything is packed into one
    argument so that this function can be used with Pool.map_async. Returns a
    dict mapping each kept symbol name to the number of times it was seen.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for symbol in get_symbols(lib):
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            symbols[symbol] = symbols.get(symbol, 0) + 1
    return symbols

if __name__ == '__main__':
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = {'dumpbin': (dumpbin_get_symbols, dumpbin_is_32bit_windows),
             'nm': (nm_get_symbols, None),
             'objdump': (None, objdump_is_32bit_windows),
             'llvm-readobj': (readobj_get_symbols, readobj_is_32bit_windows)}
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        else:
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determine the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    suffixes = ('.lib', '.a', '.obj', '.o')
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib + s):
                    lib = lib + s
                    break
                if os.path.exists('lib' + lib + s):
                    lib = 'lib' + lib + s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument " + lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol,
                 calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match(r'_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A single-component name has no second-to-last entry
                    if names and len(names) > 1 and names[-2][1]:
                        name = ''.join([x for x, _ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name, 0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o, 'w')
    else:
        outfile = sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Make sure the export list is flushed to disk; never close stdout
        if outfile is not sys.stdout:
            outfile.close()