• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2
3"""A tool for extracting a list of symbols to export
4
5When exporting symbols from a dll or exe we either need to mark the symbols in
6the source code as __declspec(dllexport) or supply a list of symbols to the
7linker. This program automates the latter by inspecting the symbol tables of a
8list of link inputs and deciding which of those symbols need to be exported.
9
10We can't just export all the defined symbols, as there's a limit of 65535
11exported symbols and in clang we go way over that, particularly in a debug
12build. Therefore a large part of the work is pruning symbols which either can't
13be imported, or which we think have definitions in public header files (i.e.
14template instantiations) and so would get defined in the thing importing these
15symbols anyway.
16"""
17
18from __future__ import print_function
19import sys
20import re
21import os
22import subprocess
23import multiprocessing
24import argparse
25
26# Define functions which extract a list of symbols from a library using several
27# different tools. We use subprocess.Popen and yield a symbol at a time instead
28# of using subprocess.check_output and returning a list as, especially on
29# Windows, waiting for the entire output to be ready can take a significant
30# amount of time.
31
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of lib using 'dumpbin /symbols'.

    This is a generator: dumpbin is only started once iteration begins, and
    symbols are yielded one at a time as its output is read rather than
    after the whole output is available.

    lib -- path of the library or object file to pass to dumpbin.

    Yields the symbol name (a string) from every output line that mentions
    a section (SECT) and is marked External.

    Raises OSError (from subprocess.Popen) if dumpbin cannot be started.
    """
    process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send dumpbin any input, so close its stdin straight away
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^.+SECT.+External\s+\|\s+(\S+).*$", line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer abandons the generator early or
        # an error occurs, so the pipe is always closed and the child is
        # always reaped.
        process.stdout.close()
        process.wait()
43
def nm_get_symbols(lib):
    """Yield the defined external symbols of lib using 'nm'.

    This is a generator: nm is only started once iteration begins, and
    symbols are yielded one at a time as its output is read rather than
    after the whole output is available.

    lib -- path of the library or object file to pass to nm.

    Yields the symbol name (a string) from every output line of the form
    '<value> <type> <name>' whose type letter is one of B, D, G, R, S, T, V
    or W.

    Raises OSError (from subprocess.Popen) if nm cannot be started.
    """
    process = subprocess.Popen(['nm',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send nm any input, so close its stdin straight away
    process.stdin.close()
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^\S+\s+[BDGRSTVW]\s+(\S+)$", line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer abandons the generator early or
        # an error occurs, so the pipe is always closed and the child is
        # always reaped.
        process.stdout.close()
        process.wait()
55
def readobj_get_symbols(lib):
    """Yield the defined external symbols of lib using 'llvm-readobj'.

    This is a generator: llvm-readobj is only started once iteration begins,
    and symbols are yielded one at a time as its output is read rather than
    after the whole output is available.

    lib -- path of the library or object file to pass to llvm-readobj.

    Yields the most recently seen Name whenever a StorageClass of External
    is seen and the most recently seen Section is neither IMAGE_SYM_ABSOLUTE
    nor IMAGE_SYM_UNDEFINED.

    Raises OSError (from subprocess.Popen) if llvm-readobj cannot be started.
    """
    process = subprocess.Popen(['llvm-readobj','-symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send llvm-readobj any input, so close its stdin straight away
    process.stdin.close()
    # Start with no name or section recorded, so that a StorageClass line
    # seen before any Name/Section line is ignored instead of raising
    # UnboundLocalError.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to
            # see Name, Section, then StorageClass, so record Name and
            # Section when we see them and decide if this is a defined
            # external symbol when we see StorageClass.
            match = re.search(r'Name: (\S+)', line)
            if match:
                name = match.group(1)
            match = re.search(r'Section: (\S+)', line)
            if match:
                section = match.group(1)
            match = re.search(r'StorageClass: (\S+)', line)
            if match:
                storageclass = match.group(1)
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        # This also runs when the consumer abandons the generator early or
        # an error occurs, so the pipe is always closed and the child is
        # always reaped.
        process.stdout.close()
        process.wait()
80
81# Define functions which determine if the target is 32-bit Windows (as that's
82# where calling convention name decoration happens).
83
def dumpbin_is_32bit_windows(lib):
    """Return True if 'dumpbin /headers' reports lib's machine as x86.

    lib -- path of the library or object file to pass to dumpbin.

    Returns False if the machine is reported as anything else, or if no
    'machine (...)' line is found in the output.

    Raises OSError (from subprocess.Popen) if dumpbin cannot be started.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    # We never send dumpbin any input, so close its stdin straight away
    process.stdin.close()
    retval = False
    try:
        for line in process.stdout:
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Close the pipe and reap the child even if reading the output
        # raised an exception.
        process.stdout.close()
        process.wait()
    return retval
100
def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f' reports lib's file format as pe-i386.

    lib -- path of the library or object file to pass to objdump.

    Only the first 'file format' line in the output is considered. Returns
    False if that format is anything else, or if there is no such line.

    Raises OSError if objdump cannot be started, and
    subprocess.CalledProcessError if it exits with a non-zero status.
    """
    output = subprocess.check_output(['objdump','-f',lib],
                                     universal_newlines=True)
    # check_output returns one string, and iterating over a string gives
    # single characters, so split it into lines before matching.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
109
def readobj_is_32bit_windows(lib):
    """Return True if llvm-readobj reports lib's format as COFF-i386.

    lib -- path of the library or object file to pass to llvm-readobj.

    Only the first line starting with 'Format:' is considered. Returns False
    if that format is anything else, or if there is no such line.

    Raises OSError if llvm-readobj cannot be started, and
    subprocess.CalledProcessError if it exits with a non-zero status.
    """
    output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
                                     universal_newlines=True)
    # check_output returns one string, and iterating over a string gives
    # single characters, so split it into lines before matching.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
118
119# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
120# identifier/type mangling we can decide which symbols could possibly be
121# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    symbol -- the symbol name as read from the symbol table.
    calling_convention_decoration -- if true, a leading '_' or '@' and any
        trailing '@...' part are stripped from unmangled names.

    Returns the name to export (the symbol itself, or the undecorated name
    for an unmangled symbol), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ??$, discard them as it's
    # assumed that the definition is public
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ??_G or ??_E and can be discarded
    # because link.exe gives you a warning telling you they can't be exported
    # if you don't
    if symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (??0) and destructors (??1) of templates (?$) are assumed
    # to be defined in headers and not required to be kept
    if symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search('(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    # Discard everything else
    return None
170
171# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
172# demangle the identifier mangling to identify symbols that can be safely
173# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    symbol -- the symbol name as read from the symbol table.
    calling_convention_decoration -- if true, one leading '_' is stripped
        from the symbol before it is examined.

    Returns the name to export (with any calling convention decoration
    removed), or None if the symbol should be discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The enclosing class is
    # the second-to-last name, so only look at it when there are at least two
    # names (a nested name that starts with a substitution can demangle to a
    # single component, and names[-2] would then raise IndexError).
    if len(names) > 1 and re.match('[CD][123]', names[-1][0]) and \
       names[-2][1]:
        return None
    # Discard function template instantiations as it's assumed that the
    # definition is public
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    return None
208
209# Certain kinds of complex manglings we assume cannot be part of a public
210# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised for a mangled name too complex to be a public interface.

    skip_itanium_template raises this when it meets the start of an
    expression (L or X) inside a template argument list.
    """
213
214# Parse an itanium mangled name from the start of a string and return a
215# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one itanium mangled name from the start of arg.

    arg -- the (remaining) mangled string.

    Returns a (name, rest of string) pair. For a length-prefixed name the
    returned name keeps its length prefix (e.g. '4llvm'). Returns
    (None, arg) if no name can be parsed.
    """
    # Check for a normal name: a decimal length followed by that many
    # characters
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1)+match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
235
236# Parse an itanium mangled template argument list from the start of a string
237# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip an itanium mangled template argument list at the start of arg.

    arg -- the (remaining) mangled string, which must start with 'I'.

    Returns the rest of the string after the E that closes the argument
    list, or None if the string runs out before that E is found.

    Raises TooComplexName if an argument starts an expression (L or X).
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names: a decimal length followed by that many characters
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    # Ran out of input (or a nested parse failed) before the closing E
    return None
269
270# Parse an itanium mangled nested name and transform it into a list of pairs of
271# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse an itanium mangled nested name from the start of arg.

    arg -- the (remaining) mangled string, which must start with 'N'.

    Returns (components, rest of string), where components is a list of
    (name, is_template) pairs ordered as they appear in the mangling.
    Returns (None, None) if the name cannot be parsed or the string ends
    before the closing E.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    components = []

    # Drop the leading N, together with a substitution if one follows it
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop any CV-qualifiers and ref qualifiers
    qualified = re.match('[rVKRO]*(.+)', remaining)
    if qualified:
        remaining = qualified.group(1)

    # Consume one name at a time until the E that closes the nested name
    while remaining:
        if remaining.startswith('E'):
            return components, remaining[1:]
        name_part, remaining = parse_itanium_name(remaining)
        if not name_part:
            # We don't know how to demangle this
            return None, None
        # A name directly followed by I is a template; its arguments are
        # skipped and only the fact that it is a template is recorded
        has_template_args = remaining.startswith('I')
        if has_template_args:
            remaining = skip_itanium_template(remaining)
        components.append((name_part, has_template_args))

    # Falling out of the loop means the closing E was never found
    return None, None
311
def extract_symbols(arg):
    """Count the kept symbols of one library.

    arg -- a (get_symbols, should_keep_symbol,
        calling_convention_decoration, lib) tuple, packed into a single
        argument so that this function can be used with a pool map.

    Returns a dict mapping each name returned by should_keep_symbol to the
    number of times it was produced for lib.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = {}
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if not kept:
            # Discarded symbols are not counted
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
320
if __name__ == '__main__':
    # Script entry point: parse the command line, pick the tools to use,
    # extract the symbols from every input in parallel, then print the
    # symbols that should be exported, one per line.
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
        except OSError:
            continue
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determine the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        suffixes = ['.lib','.a','.obj','.o']
        if not any([lib.endswith(s) for s in suffixes]):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not any([lib.endswith(s) for s in suffixes]):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
        # All work is done, so wait for the worker processes to exit
        pool.join()
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k,v in this_lib_symbols.items():
            symbols[k] = v + symbols.setdefault(k,0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    # Symbols that aren't member functions of templates map to "", whose count
    # stays at 0
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A name with a single component has no second-to-last
                    # name, so check the length before indexing
                    if names and len(names) > 1 and names[-2][1]:
                        name = ''.join([x for x,_ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name,0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    if args.o:
        outfile = open(args.o,'w')
    else:
        outfile = sys.stdout
    try:
        for k,v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Close the output file if we opened one, so that everything written
        # is flushed to disk; leave sys.stdout alone
        if outfile is not sys.stdout:
            outfile.close()
497