• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2
"""A tool for extracting a list of symbols to export

When exporting symbols from a dll or exe we either need to mark the symbols in
the source code as __declspec(dllexport) or supply a list of symbols to the
linker. This program automates the latter by inspecting the symbol tables of a
list of link inputs and deciding which of those symbols need to be exported.

We can't just export all the defined symbols, as there's a limit of 65535
exported symbols and in clang we go way over that, particularly in a debug
build. Therefore a large part of the work is pruning symbols which either can't
be imported, or which we think have definitions in public header files (i.e.
template instantiations) and so would get defined in the thing importing these
symbols anyway.
"""
17
18from __future__ import print_function
19import sys
20import re
21import os
22import subprocess
23import multiprocessing
24import argparse
25
26# Define functions which extract a list of symbols from a library using several
27# different tools. We use subprocess.Popen and yield a symbol at a time instead
28# of using subprocess.check_output and returning a list as, especially on
29# Windows, waiting for the entire output to be ready can take a significant
30# amount of time.
31
def dumpbin_get_symbols(lib):
    """Yield the defined external symbols that ``dumpbin /symbols`` lists.

    Runs ``dumpbin /symbols`` on lib and yields, one at a time as the output
    arrives, the symbol name from every output line that describes an
    external symbol defined in some section.
    """
    # Look for external symbols that are defined in some section
    defined_external = re.compile(r"^.+SECT.+External\s+\|\s+(\S+).*$")
    process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    try:
        # The tool needs no input, so make sure it never waits for any.
        process.stdin.close()
        for line in process.stdout:
            match = defined_external.match(line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer abandons the generator early:
        # close the pipe first so the tool cannot block writing to it, then
        # reap the process.
        process.stdout.close()
        process.wait()
43
def nm_get_symbols(lib):
    """Yield the defined external symbols that ``nm`` lists for lib.

    Runs ``nm`` on lib and yields, one at a time as the output arrives, the
    symbol name from every output line whose type letter is one of
    ``BDGRSTVW``.
    """
    # Look for external symbols that are defined in some section
    defined_external = re.compile(r"^\S+\s+[BDGRSTVW]\s+(\S+)$")
    process = subprocess.Popen(['nm',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    try:
        # The tool needs no input, so make sure it never waits for any.
        process.stdin.close()
        for line in process.stdout:
            match = defined_external.match(line)
            if match:
                yield match.group(1)
    finally:
        # This also runs when the consumer abandons the generator early:
        # close the pipe first so the tool cannot block writing to it, then
        # reap the process.
        process.stdout.close()
        process.wait()
55
def readobj_get_symbols(lib):
    """Yield the defined external symbols that ``llvm-readobj`` lists.

    Runs ``llvm-readobj -symbols`` on lib and scans its output line by line.
    A symbol name is yielded when a StorageClass line says ``External`` and
    the most recently seen Section is neither ``IMAGE_SYM_ABSOLUTE`` nor
    ``IMAGE_SYM_UNDEFINED``.
    """
    name_re = re.compile(r'Name: (\S+)')
    section_re = re.compile(r'Section: (\S+)')
    storageclass_re = re.compile(r'StorageClass: (\S+)')
    not_defined = ('IMAGE_SYM_ABSOLUTE', 'IMAGE_SYM_UNDEFINED')
    process = subprocess.Popen(['llvm-readobj','-symbols',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    try:
        # The tool needs no input, so make sure it never waits for any.
        process.stdin.close()
        # Nothing has been seen yet; a StorageClass line that turns up before
        # any Name/Section line is ignored instead of hitting unbound locals.
        name = None
        section = None
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to
            # see Name, Section, then StorageClass, so record Name and
            # Section when we see them and decide if this is a defined
            # external symbol when we see StorageClass.
            match = name_re.search(line)
            if match:
                name = match.group(1)
            match = section_re.search(line)
            if match:
                section = match.group(1)
            match = storageclass_re.search(line)
            if match:
                storageclass = match.group(1)
                if name is None or section is None:
                    continue
                if section not in not_defined and storageclass == 'External':
                    yield name
    finally:
        # This also runs when the consumer abandons the generator early:
        # close the pipe first so the tool cannot block writing to it, then
        # reap the process.
        process.stdout.close()
        process.wait()
80
81# Define functions which determine if the target is 32-bit Windows (as that's
82# where calling convention name decoration happens).
83
def dumpbin_is_32bit_windows(lib):
    """Return True if ``dumpbin /headers`` reports lib's machine as x86.

    Returns False when the machine is anything else, or when no 'machine'
    line is found in the output at all.
    """
    machine_re = re.compile(r'.+machine \((\S+)\)')
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    retval = False
    try:
        process.stdin.close()
        for line in process.stdout:
            match = machine_re.match(line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        # Close the pipe before waiting so the tool is not left blocked
        # writing the rest of its output.
        process.stdout.close()
        process.wait()
    return retval
100
def objdump_is_32bit_windows(lib):
    """Return True if ``objdump -f`` reports lib's file format as pe-i386.

    Returns False for any other file format, or when the output has no
    'file format' line.
    """
    output = subprocess.check_output(['objdump','-f',lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it
    # character by character, so split it into lines first.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return (match.group(1) == 'pe-i386')
    return False
109
def readobj_is_32bit_windows(lib):
    """Return True if ``llvm-readobj -file-headers`` reports COFF-i386.

    Returns False for any other format, or when the output has no line
    starting with 'Format:'.
    """
    output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
                                     universal_newlines=True)
    # check_output returns one string; iterating it directly would walk it
    # character by character, so split it into lines first.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return (match.group(1) == 'COFF-i386')
    return False
118
119# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
120# identifier/type mangling we can decide which symbols could possibly be
121# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol with Microsoft mangling should be exported.

    Returns the name to export, or None if the symbol should be discarded.
    For unmangled names the returned name has its calling convention
    decoration removed when calling_convention_decoration is true; mangled
    names are returned unchanged.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations
    # of clang::Type::getAs, as some of them are explicit specializations that
    # are defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed
    # that the definition is public
    if re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    if symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    if symbol.startswith(('??_G', '??_E')):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to
    # be defined in headers and not required to be kept
    if symbol.startswith(('??0?$', '??1?$')):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    if re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is
    # a bit of a mess and imprecise, but that avoids having to completely
    # demangle the symbol name. The outermost namespace is at the end of the
    # identifier mangling, and the identifier mangling is followed by the type
    # mangling, so we look for (llvm|clang)@@ followed by something that looks
    # like a function type mangling. To spot a function type we use (this is
    # derived from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    if re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
        return symbol
    return None
174
175# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
176# demangle the identifier mangling to identify symbols that can be safely
177# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether a symbol with Itanium mangling should be exported.

    Returns the name to export, or None if the symbol should be discarded.
    When calling_convention_decoration is true a single leading underscore is
    removed first, and the returned name is the undecorated one.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check stops
    # a name with a single component from raising IndexError on names[-2].
    if (len(names) > 1 and re.match('[CD][123]', names[-1][0])
            and names[-2][1]):
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    if symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    if names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    if names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    return None
216
217# Certain kinds of complex manglings we assume cannot be part of a public
218# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised for a mangling judged too complex to be in a public interface."""
221
222# Parse an itanium mangled name from the start of a string and return a
223# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one itanium mangled name from the start of arg.

    Returns a (name, rest of string) pair. A length-prefixed name keeps its
    length prefix in the returned name (e.g. '4llvm'). When nothing can be
    parsed the pair is (None, arg).
    """
    # Check for a normal name
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        n = int(match.group(1))
        name = match.group(1)+match.group(2)[:n]
        rest = match.group(2)[n:]
        return name, rest
    # Check for constructor/destructor names
    match = re.match('([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match('([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg
243
244# Parse an itanium mangled template argument list from the start of a string
245# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip an itanium mangled template argument list at the start of arg.

    arg must start with 'I'. Returns the rest of the string after the
    closing 'E', or None if the string runs out before the list is closed.
    Raises TooComplexName when an argument starts with 'L' or 'X', which is
    taken to be an expression.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            n = int(match.group(1))
            tmp = match.group(2)[n:]
            continue
        # Check for substitutions
        match = re.match('S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
        # Start of a template
        elif tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
        # Start of a nested name
        elif tmp.startswith('N'):
            _, tmp = parse_itanium_nested_name(tmp)
        # Start of an expression: assume that it's too complicated
        elif tmp.startswith('L') or tmp.startswith('X'):
            raise TooComplexName
        # End of the template
        elif tmp.startswith('E'):
            return tmp[1:]
        # Something else: probably a type, skip it
        else:
            tmp = tmp[1:]
    return None
277
278# Parse an itanium mangled nested name and transform it into a list of pairs of
279# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse an itanium mangled nested name from the start of arg.

    arg must start with 'N'. Returns (list, rest of string), where the list
    holds one (name, is_template) pair per component of the nested name.
    Returns (None, None) when the name cannot be demangled.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    components = []

    # Drop the leading N together with a substitution, if one follows it
    substitution = re.match('NS[A-Z0-9]*_(.+)', arg)
    remaining = substitution.group(1) if substitution else arg[1:]

    # Drop any CV-qualifiers and ref qualifiers
    qualifiers = re.match('[rVKRO]*(.+)', remaining)
    if qualifiers:
        remaining = qualifiers.group(1)

    # Keep pulling names off the front of the string until the E that closes
    # the nested name is reached
    while remaining:
        if remaining.startswith('E'):
            return components, remaining[1:]
        component, remaining = parse_itanium_name(remaining)
        if not component:
            # We don't know how to demangle this
            return None, None
        # A name directly followed by I is a template; note that and throw
        # the template arguments away
        templated = remaining.startswith('I')
        if templated:
            remaining = skip_itanium_template(remaining)
        components.append((component, templated))

    # Running out of input before the closing E means something went wrong
    return None, None
319
def extract_symbols(arg):
    """Count the symbols worth keeping in a single library.

    arg is a (get_symbols, should_keep_symbol, calling_convention_decoration,
    lib) tuple, packed into one value so that it can be passed through a
    multiprocessing map. Returns a dict mapping each kept symbol name to the
    number of times it was seen in lib.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    counts = {}
    for raw_symbol in get_symbols(lib):
        kept = should_keep_symbol(raw_symbol, calling_convention_decoration)
        if not kept:
            continue
        counts[kept] = counts.get(kept, 0) + 1
    return counts
328
if __name__ == '__main__':
    # Command-line driver: pick the available tools, extract the symbols of
    # every input library in parallel, prune them, and print the survivors.
    tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium','microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
              'nm' : (nm_get_symbols, None),
              'objdump' : (None, objdump_is_32bit_windows),
              'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    get_symbols = None
    is_32bit_windows = None
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
            # Keep going until we have a tool to use for both get_symbols and
            # is_32bit_windows
            if not get_symbols:
                get_symbols = tools[exe][0]
            if not is_32bit_windows:
                is_32bit_windows = tools[exe][1]
            if get_symbols and is_32bit_windows:
                break
        except OSError:
            continue
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    suffixes = ('.lib', '.a', '.obj', '.o')
    libs = list()
    for lib in args.libs:
        # When invoked by cmake the arguments are the cmake target names of the
        # libraries, so we need to add .lib/.a to the end and maybe lib to the
        # start to get the filename. Also allow objects.
        if not lib.endswith(suffixes):
            for s in suffixes:
                if os.path.exists(lib+s):
                    lib = lib+s
                    break
                if os.path.exists('lib'+lib+s):
                    lib = 'lib'+lib+s
                    break
        if not lib.endswith(suffixes):
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(lib)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = symbols.get(k, 0) + v

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = dict()
    template_function_mapping = dict()
    template_function_count[""] = 0
    for k in symbols:
        name = None
        if args.mangling == 'microsoft':
            # Member functions of templates start with
            # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
            # As manglings go from the innermost scope to the outermost scope
            # this means:
            #  * When we have a function member of a subclass of a template
            #    class then <fn_name> will actually contain the mangling of
            #    both the subclass and the function member. This is fine.
            #  * When we have a function member of a template subclass of a
            #    (possibly template) class then it's the innermost template
            #    subclass that becomes <class_name>. This should be OK so long
            #    as we don't have multiple classes with a template subclass of
            #    the same name.
            match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", k)
            if match:
                name = match.group(1)
        else:
            # Find member functions of templates by demangling the name and
            # checking if the second-to-last name in the list is a template.
            match = re.match('_Z(T[VTIS])?(N.+)', k)
            if match:
                try:
                    names, _ = parse_itanium_nested_name(match.group(2))
                    # A name with a single component has no second-to-last
                    # entry, so it cannot be a member of a template class.
                    if names and len(names) > 1 and names[-2][1]:
                        name = ''.join([x for x,_ in names])
                except TooComplexName:
                    # Manglings that are too complex should already have been
                    # filtered out, but if we happen to somehow see one here
                    # just leave it as-is.
                    pass
        if name:
            old_count = template_function_count.setdefault(name,0)
            template_function_count[name] = old_count + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    outfile = open(args.o,'w') if args.o else sys.stdout
    try:
        for k, v in symbols.items():
            template_count = template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Flush and close a real output file, but leave stdout open.
        if outfile is not sys.stdout:
            outfile.close()
505