• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Repository rule for NCCL."""
2
3load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
4load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
5
# CUDA toolkit version as tuple (e.g. '(11, 1)').
# The %{cuda_version} placeholder is filled in by template expansion when the
# repository rule instantiates this file.
_cuda_version = %{cuda_version}
8
def _gen_device_srcs_impl(ctx):
    """Generates one copy of each template per (reduction op, data type) pair.

    Each output is the input file with '#define NCCL_OP <i>' and
    '#define NCCL_TYPE <j>' spliced in right after the header comment.
    """
    ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"]

    # TF uses CUDA version > 11.0, so enable bf16 type unconditionally.
    types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "bf16", "f32", "f64"]
    hdr_tail = "****************************************/"
    defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"

    outputs = []
    for op_index, op_name in enumerate(ops):
        for type_index, type_name in enumerate(types):
            # Append the defines to the end of the file header comment.
            substitutions = {
                hdr_tail: hdr_tail + defines % (op_index, type_index),
            }
            for template in ctx.files.srcs:
                out = ctx.actions.declare_file(
                    "%s_%s_%s" % (op_name, type_name, template.basename),
                    sibling = template,
                )
                ctx.actions.expand_template(
                    output = out,
                    template = template,
                    substitutions = substitutions,
                )
                outputs.append(out)
    return [DefaultInfo(files = depset(outputs))]
32
gen_device_srcs = rule(
    implementation = _gen_device_srcs_impl,
    attrs = {
        # Template source files; one output is generated per op/type pair.
        "srcs": attr.label_list(allow_files = True),
    },
)
"""Copies each file in srcs once per op/type pair under a prefixed name and
adds matching #define NCCL_OP and #define NCCL_TYPE lines to its header."""
40
def _rdc_copts():
    """Returns copts for compiling relocatable device code."""

    # The global functions can not have a lower register count than the
    # device functions. This is enforced by setting a fixed register count.
    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    maxrregcount = "-maxrregcount=96"

    # Each nvcc option must be passed as its own "-nvcc_options <flag>" pair.
    nvcc_flags = [
        "relocatable-device-code=true",
        "ptxas-options=" + maxrregcount,
        "extended-lambda",
    ]
    nvcc_copts = []
    for flag in nvcc_flags:
        nvcc_copts += ["-nvcc_options", flag]

    return cuda_default_copts() + select({
        "@local_config_cuda//:is_cuda_compiler_nvcc": nvcc_copts,
        "@local_config_cuda//:is_cuda_compiler_clang": [
            "-fcuda-rdc",
            "-Xcuda-ptxas",
            maxrregcount,
        ],
        "//conditions:default": [],
    })
65
def _lookup_file(filegroup, path):
    """Returns the first file in filegroup whose path ends with 'path', or None."""
    matches = [f for f in filegroup.files if f.path.endswith(path)]
    return matches[0] if matches else None
72
def _pic_only(files):
    """Returns the PIC subset of 'files', or all of 'files' if none are PIC."""
    pic_files = [f for f in files if ".pic." in f.basename]
    return pic_files if pic_files else files
77
def _device_link_impl(ctx):
    """Device-links RDC objects from 'deps' and generates registration source.

    Per GPU architecture, nvlink links the relocatable device code into a
    cubin and emits a kernel registration header. All cubins are merged into
    an embedded-fatbin header via fatbinary (and bin2c on old CUDA versions),
    and 'out' is expanded from the CUDA link.stub template to #include the
    generated headers.
    """
    if not ctx.attr.gpu_archs:
        fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")

    # Collect object files from all deps, preferring PIC variants.
    inputs = []
    for dep in ctx.attr.deps:
        inputs += dep.files.to_list()
    inputs = _pic_only(inputs)

    # Device-link to cubins for each architecture.
    name = ctx.attr.name
    register_h = None
    cubins = []
    images = []
    for arch in ctx.attr.gpu_archs:
        arch = arch.replace("compute_", "sm_")  # PTX is JIT-linked at runtime.
        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
        ctx.actions.run(
            outputs = [register_h, cubin],
            inputs = inputs,
            executable = ctx.file._nvlink,
            arguments = ctx.attr.nvlink_args + [
                "--arch=%s" % arch,
                "--register-link-binaries=%s" % register_h.path,
                "--output-file=%s" % cubin.path,
            ] + [file.path for file in inputs],
            mnemonic = "nvlink",
            use_default_shell_env = True,
        )
        cubins.append(cubin)
        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))

    # Generate fatbin header from all cubins.
    # NOTE(review): 'register_h' past this point is only the header of the
    # LAST architecture in gpu_archs; earlier iterations' headers are created
    # but never referenced below — confirm this is intentional.
    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
    bin2c = ctx.file._bin2c
    arguments_list = [
        "-64",
        "--cmdline=--compile-only",
        "--link",
        "--compress-all",
        "--create=%s" % tmp_fatbin.path,
        "--embedded-fatbin=%s" % fatbin_h.path,
    ]
    # Older fatbinary versions need to be told where to find bin2c.
    if _cuda_version <= (10, 1):
        arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
    ctx.actions.run(
        outputs = [tmp_fatbin, fatbin_h],
        inputs = cubins,
        executable = ctx.file._fatbinary,
        arguments = arguments_list + images,
        tools = [bin2c],
        mnemonic = "fatbinary",
        use_default_shell_env = True,
    )

    # Generate the source file #including the headers generated above.
    ctx.actions.expand_template(
        output = ctx.outputs.out,
        template = ctx.file._link_stub,
        substitutions = {
            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
            "FATBINFILE": '"%s"' % fatbin_h.short_path,
        },
    )

    return [DefaultInfo(files = depset([register_h, fatbin_h]))]
146
_device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        # Libraries whose object files carry relocatable device code.
        "deps": attr.label_list(),
        # Generated source file registering the kernels (expanded link.stub).
        "out": attr.output(mandatory = True),
        # GPU architectures to device-link, e.g. ["sm_70", "compute_80"].
        "gpu_archs": attr.string_list(),
        # Extra command line arguments passed to nvlink.
        "nvlink_args": attr.string_list(),
        # The tools below ship with the CUDA SDK in @local_config_cuda.
        "_nvlink": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_fatbinary": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_bin2c": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        # Template that becomes 'out' after substituting the header paths.
        "_link_stub": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
)
"""Links device code and generates source code for kernel registration."""
179
def _prune_relocatable_code_impl(ctx):
    """Clears __nv_relfatbin section containing relocatable device code.

    Runs nvprune with -no-relocatable-elf on each object file of 'input',
    keeping only the code for the active GPU architectures. On CUDA versions
    without -no-relocatable-elf support, the input is forwarded unpruned.
    """
    if _cuda_version < (11, 3):
        # -no-relocatable-elf not supported, return unpruned input.
        # Wrapped in a list for consistency with the other rule impls here,
        # matching the documented 'list of providers' return convention.
        return [ctx.attr.input[DefaultInfo]]

    # nvcc --generate-code options for the active set of cuda architectures.
    gencodes = []
    for code in ctx.attr.gpu_archs:
        arch = code.replace("compute_", "sm_")
        if code != arch:
            # 'compute_XX' entries keep both the SASS and the PTX variant.
            gencodes.append((arch, arch))
        gencodes.append((arch, code))

    outputs = []
    for obj in ctx.files.input:
        pruned = ctx.actions.declare_file(
            "pruned_" + obj.basename,
            sibling = obj,
        )
        arguments = (
            ["--generate-code=arch=%s,code=%s" % gencode for gencode in gencodes] +
            ["-no-relocatable-elf", "--output-file=%s" % pruned.path, obj.path]
        )
        ctx.actions.run(
            outputs = [pruned],
            inputs = [obj],
            executable = ctx.file._nvprune,
            arguments = arguments,
            mnemonic = "nvprune",
            use_default_shell_env = True,
        )
        outputs.append(pruned)

    return [DefaultInfo(files = depset(outputs))]
216
_prune_relocatable_code = rule(
    implementation = _prune_relocatable_code_impl,
    attrs = {
        # Target whose object files get their RDC sections cleared.
        "input": attr.label(mandatory = True, allow_files = True),
        # GPU architectures to retain in the pruned objects.
        "gpu_archs": attr.string_list(),
        # nvprune tool shipped with the CUDA SDK.
        "_nvprune": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
    },
)
"""Clears the relocatable device code sections of the objects in 'input'."""
230
def _merge_archive_impl(ctx):
    # Merge the archives in srcs by feeding an MRI script to 'ar -M'.
    # See https://stackoverflow.com/a/23621751.
    # The literal '\n' escapes are expanded by 'echo -e' in the shell command.
    files = _pic_only(ctx.files.srcs)
    script_lines = ["create " + ctx.outputs.out.path]
    script_lines += ["addlib " + f.path for f in files]
    script_lines += ["save", "end"]
    mri_script = r"\n".join(script_lines)

    cc_toolchain = find_cpp_toolchain(ctx)
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
        command = "echo -e \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
        use_default_shell_env = True,
    )
247
_merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        # Archives to merge; PIC variants are preferred when present.
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        # Implicit dependency so find_cpp_toolchain() can resolve 'ar'.
        "_cc_toolchain": attr.label(
            default = "@bazel_tools//tools/cpp:current_cc_toolchain",
        ),
        # "_crosstool": attr.label_list(
        #     cfg = "host",
        #     default = ["@bazel_tools//tools/cpp:crosstool"]
        # ),
    },
    # '%{name}' here is Bazel's output-name template, not a repo substitution.
    outputs = {"out": "lib%{name}.a"},
)
"""Merges srcs into a single archive."""
263
def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
    r"""Produces a cuda_library using separate compilation and linking.

    CUDA separate compilation and linking allows device function calls across
    translation units. This is different from the normal whole program
    compilation where each translation unit contains all device code. For more
    background, see
    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation

    During separate compilation, the different CUDA source files are compiled
    to 'relocatable device code' (RDC) and embedded in the host object files.
    When using nvcc, linking the device code for each supported GPU
    architecture and generating kernel registration code for the CUDA runtime
    is handled automatically. Clang supports generating relocatable device
    code, but it can't link it. We therefore rely on tools provided by the CUDA
    SDK to link the device code and generate the host code to register the
    kernels.

    The nvlink tool extracts the RDC code from the object files and links it
    into cubin files, one per GPU architecture. It also produces a header file
    with a list of kernel names to register. The cubins are merged into a
    binary blob using the fatbinary tool, and converted to a C header file with
    the help of the bin2c tool. The registration header file, the fatbinary
    header file, and the link.stub file (shipped with the CUDA SDK) are
    compiled as ordinary host code.

    Here is a diagram of the CUDA separate compilation trajectory:

     x.cu.cc    y.cu.cc
           \    /            cc_library (compile RDC and archive)
            xy.a
           /    \            * nvlink
    register.h  xy.cubin
          :      |           * fatbinary and bin2c
          :     xy.fatbin.h
          :      :           * #include
          dlink.cc           * Expanded from crt/dlink.stub template
             |               cc_library (host compile and archive)
          dlink.a

    The steps marked with '*' are implemented in the _device_link rule.

    The intermediate relocatable device code in xy.a is no longer needed at
    this point and the corresponding section is replaced with an empty one using
    objcopy. We do not remove the section completely because it is referenced by
    relocations, and removing those as well breaks fatbin registration.

    The object files in both xy.a and dlink.a reference symbols defined in the
    other archive. The separate archives are a side effect of using two
    cc_library targets to implement a single compilation trajectory. We could
    fix this once bazel supports C++ sandwich. For now, we just merge the two
    archives to avoid unresolved symbols:

                    xy.a
                     |         objcopy --update-section __nv_relfatbin=''
    dlink.a     xy_pruned.a
         \           /         merge archive
          xy_merged.a
              |                cc_library (or alternatively, cc_import)
         final target

    Another complication is that cc_library produces (depending on the
    configuration) both PIC and non-PIC archives, but the distinction
    is hidden from Starlark until C++ sandwich becomes available. We work
    around this by dropping the non-PIC files if PIC files are available.

    Args:
      name: Target name.
      hdrs: Header files.
      copts: Compiler options.
      linkstatic: Must be true.
      **kwargs: Any other arguments.
    """

    # None defaults replaced here to avoid shared mutable default arguments.
    if not hdrs:
        hdrs = []
    if not copts:
        copts = []

    # Compile host and device code into library.
    lib = name + "_lib"
    native.cc_library(
        name = lib,
        hdrs = hdrs,
        copts = _rdc_copts() + copts,
        linkstatic = linkstatic,
        **kwargs
    )

    # Generate source file containing linked device code.
    dlink_hdrs = name + "_dlink_hdrs"
    dlink_cc = name + "_dlink.cc"
    _device_link(
        name = dlink_hdrs,
        deps = [lib],
        out = dlink_cc,
        gpu_archs = cuda_gpu_architectures(),
        # Tell nvlink the host CPU architecture of the object files.
        nvlink_args = select({
            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
            "//conditions:default": [],
        }),
    )

    # Compile the source file into a library.
    dlink = name + "_dlink"
    native.cc_library(
        name = dlink,
        srcs = [dlink_cc],
        textual_hdrs = [dlink_hdrs],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
        ],
        defines = [
            # Silence warning about including internal header.
            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
            # Macros that need to be defined starting with CUDA 10.
            "__NV_EXTRA_INITIALIZATION=",
            "__NV_EXTRA_FINALIZATION=",
        ],
        linkstatic = linkstatic,
    )

    # Remove intermediate relocatable device code.
    pruned = name + "_pruned"
    _prune_relocatable_code(
        name = pruned,
        input = lib,
        gpu_archs = cuda_gpu_architectures(),
    )

    # Repackage the two libs into a single archive. This is required because
    # both libs reference symbols defined in the other one. For details, see
    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
    merged = name + "_merged"
    _merge_archive(
        name = merged,
        srcs = [pruned, dlink],
    )

    # Create cc target from archive.
    native.cc_library(
        name = name,
        srcs = [merged],
        hdrs = hdrs,
        linkstatic = linkstatic,
    )
412