1"""Repository rule for NCCL."""
2
3load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
4load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
5
6# CUDA toolkit version as tuple (e.g. '(11, 1)').
7_cuda_version = %{cuda_version}
8
9def _gen_device_srcs_impl(ctx):
10    ops = ["sum", "prod", "min", "max"]
11    types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"]
12    hdr_tail = "****************************************/"
13    defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d"
14
15    files = []
16    for NCCL_OP, op in enumerate(ops):
17        for NCCL_TYPE, dt in enumerate(types):
18            substitutions = {
19                hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE),
20            }
21            for src in ctx.files.srcs:
22                name = "%s_%s_%s" % (op, dt, src.basename)
23                file = ctx.actions.declare_file(name, sibling = src)
24                ctx.actions.expand_template(
25                    output = file,
26                    template = src,
27                    substitutions = substitutions,
28                )
29                files.append(file)
30    return [DefaultInfo(files = depset(files))]
31
32gen_device_srcs = rule(
33    implementation = _gen_device_srcs_impl,
34    attrs = {
35        "srcs": attr.label_list(allow_files = True),
36    },
37)
38"""Adds prefix to each file name in srcs and adds #define NCCL_OP."""

def _rdc_copts():
    """Returns copts for compiling relocatable device code."""

    # The global functions can not have a lower register count than the
    # device functions. This is enforced by setting a fixed register count.
    # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48
    maxrregcount = "-maxrregcount=96"

    return cuda_default_copts() + select({
        "@local_config_cuda//cuda:using_nvcc": [
            "-nvcc_options",
            "relocatable-device-code=true",
            "-nvcc_options",
            "ptxas-options=" + maxrregcount,
        ],
        "@local_config_cuda//cuda:using_clang": [
            "-fcuda-rdc",
            "-Xcuda-ptxas",
            maxrregcount,
        ],
        "//conditions:default": [],
    })

def _lookup_file(filegroup, path):
    """Extracts file at (relative) path in filegroup."""
    for file in filegroup.files:
        if file.path.endswith(path):
            return file
    return None

def _pic_only(files):
    """Returns the PIC files if there are any in 'files', otherwise 'files'."""
    pic_only = [f for f in files if f.basename.find(".pic.") >= 0]
    return pic_only if pic_only else files

def _device_link_impl(ctx):
    if not ctx.attr.gpu_archs:
        fail("No GPU architecture specified. NCCL requires --config=cuda or similar.")

    inputs = []
    for dep in ctx.attr.deps:
        inputs += dep.files.to_list()
    inputs = _pic_only(inputs)

    # Device-link to cubins for each architecture.
    name = ctx.attr.name
    register_h = None
    cubins = []
    images = []
    for arch in ctx.attr.gpu_archs:
        arch = arch.replace("compute_", "sm_")  # PTX is JIT-linked at runtime.
        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
        ctx.actions.run(
            outputs = [register_h, cubin],
            inputs = inputs,
            executable = ctx.file._nvlink,
            arguments = ctx.attr.nvlink_args + [
                "--arch=%s" % arch,
                "--register-link-binaries=%s" % register_h.path,
                "--output-file=%s" % cubin.path,
            ] + [file.path for file in inputs],
            mnemonic = "nvlink",
        )
        cubins.append(cubin)
        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))

    # Generate fatbin header from all cubins.
    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
    bin2c = ctx.file._bin2c
    arguments_list = [
        "-64",
        "--cmdline=--compile-only",
        "--link",
        "--compress-all",
        "--create=%s" % tmp_fatbin.path,
        "--embedded-fatbin=%s" % fatbin_h.path,
    ]
    if _cuda_version <= (10, 1):
        arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
    ctx.actions.run(
        outputs = [tmp_fatbin, fatbin_h],
        inputs = cubins,
        executable = ctx.file._fatbinary,
        arguments = arguments_list + images,
        tools = [bin2c],
        mnemonic = "fatbinary",
    )

    # Generate the source file #including the headers generated above.
    ctx.actions.expand_template(
        output = ctx.outputs.out,
        template = ctx.file._link_stub,
        substitutions = {
            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
            "FATBINFILE": '"%s"' % fatbin_h.short_path,
        },
    )

    return [DefaultInfo(files = depset([register_h, fatbin_h]))]

_device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        "deps": attr.label_list(),
        "out": attr.output(mandatory = True),
        "gpu_archs": attr.string_list(),
        "nvlink_args": attr.string_list(),
        "_nvlink": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_fatbinary": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_bin2c": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_link_stub": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
)
"""Links device code and generates source code for kernel registration."""

def _prune_relocatable_code_impl(ctx):
    """Clears __nv_relfatbin section containing relocatable device code."""

    if _cuda_version < (11, 3):
        # -no-relocatable-elf not supported, return unpruned input.
        return ctx.attr.input[DefaultInfo]

    # nvcc --generate-code options for the active set of cuda architectures.
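    # For example, a hypothetical gpu_archs = ["sm_70", "compute_80"] expands to:
    #   --generate-code=arch=sm_70,code=sm_70
    #   --generate-code=arch=sm_80,code=sm_80
    #   --generate-code=arch=sm_80,code=compute_80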
    gencodes = []
    for code in ctx.attr.gpu_archs:
        arch = code.replace("compute_", "sm_")
        if code != arch:
            gencodes.append((arch, arch))
        gencodes.append((arch, code))

    outputs = []
    for input in ctx.files.input:
        output = ctx.actions.declare_file(
            "pruned_" + input.basename,
            sibling = input,
        )
        arguments = (
            ["--generate-code=arch=%s,code=%s" % code for code in gencodes] +
            ["-no-relocatable-elf", "--output-file=%s" % output.path, str(input.path)]
        )
        ctx.actions.run(
            outputs = [output],
            inputs = [input],
            executable = ctx.file._nvprune,
            arguments = arguments,
            mnemonic = "nvprune",
        )
        outputs.append(output)

    return DefaultInfo(files = depset(outputs))

_prune_relocatable_code = rule(
    implementation = _prune_relocatable_code_impl,
    attrs = {
        "input": attr.label(mandatory = True, allow_files = True),
        "gpu_archs": attr.string_list(),
        "_nvprune": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
    },
)

def _merge_archive_impl(ctx):
    # Generate an MRI script to merge the archives in srcs and pass it to 'ar'.
    # See https://stackoverflow.com/a/23621751.
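    #
    # For example, with hypothetical inputs the expanded script would be:
    #   create libmerged.a
    #   addlib libfirst.pic.a
    #   addlib libsecond.pic.a
    #   save
    #   end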
    files = _pic_only(ctx.files.srcs)
    mri_script = "create " + ctx.outputs.out.path
    for f in files:
        mri_script += r"\naddlib " + f.path
    mri_script += r"\nsave\nend"

    cc_toolchain = find_cpp_toolchain(ctx)
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
        command = "echo -e \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
    )

_merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        "_cc_toolchain": attr.label(
            default = "@bazel_tools//tools/cpp:current_cc_toolchain",
        ),
        # "_crosstool": attr.label_list(
        #     cfg = "host",
        #     default = ["@bazel_tools//tools/cpp:crosstool"]
        # ),
    },
    outputs = {"out": "lib%{name}.a"},
)
"""Merges srcs into a single archive."""

def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
    r"""Produces a cuda_library using separate compilation and linking.

    CUDA separate compilation and linking allows device function calls across
    translation units. This is different from the normal whole program
    compilation where each translation unit contains all device code. For more
    background, see
    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation

    During separate compilation, the different CUDA source files are compiled
    to 'relocatable device code' (RDC) and embedded in the host object files.
    When using nvcc, linking the device code for each supported GPU
    architecture and generating kernel registration code for the CUDA runtime
    is handled automatically. Clang supports generating relocatable device
    code, but it can't link it. We therefore rely on tools provided by the CUDA
    SDK to link the device code and generate the host code to register the
    kernels.

    The nvlink tool extracts the RDC code from the object files and links it
    into cubin files, one per GPU architecture. It also produces a header file
    with a list of kernel names to register. The cubins are merged into a
    binary blob using the fatbinary tool, and converted to a C header file with
    the help of the bin2c tool. The registration header file, the fatbinary
    header file, and the link.stub file (shipped with the CUDA SDK) are
    compiled as ordinary host code.

    Here is a diagram of the CUDA separate compilation trajectory:

     x.cu.cc    y.cu.cc
           \    /            cc_library (compile RDC and archive)
            xy.a
           /    \            * nvlink
    register.h  xy.cubin
          :      |           * fatbinary and bin2c
          :     xy.fatbin.h
          :      :           * #include
          dlink.cc           * Expanded from crt/link.stub template
             |               cc_library (host compile and archive)
          dlink.a

    The steps marked with '*' are implemented in the _device_link rule.

    The intermediate relocatable device code in xy.a is no longer needed at
    this point, so the __nv_relfatbin section holding it is cleared by the
    _prune_relocatable_code rule (nvprune, on CUDA 11.3 and newer). We do not
    remove the section completely because it is referenced by relocations, and
    removing those as well breaks fatbin registration.

    The object files in both xy.a and dlink.a reference symbols defined in the
    other archive. The separate archives are a side effect of using two
    cc_library targets to implement a single compilation trajectory. We could
    fix this once bazel supports C++ sandwich. For now, we just merge the two
    archives to avoid unresolved symbols:

                    xy.a
                     |         _prune_relocatable_code (clear __nv_relfatbin)
    dlink.a     xy_pruned.a
         \           /         merge archive
          xy_merged.a
              |                cc_library (or alternatively, cc_import)
         final target

    Another complication is that cc_library produces (depending on the
    configuration) both PIC and non-PIC archives, but the distinction
    is hidden from Starlark until C++ sandwich becomes available. We work
    around this by dropping the non-PIC files if PIC files are available.

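    Example (illustrative; target and file names are hypothetical):

      cuda_rdc_library(
          name = "device",
          srcs = ["collectives.cu.cc"],
          hdrs = ["collectives.h"],
      )
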
    Args:
      name: Target name.
      hdrs: Header files.
      copts: Compiler options.
      linkstatic: Must be true.
      **kwargs: Any other arguments.
    """

    if not hdrs:
        hdrs = []
    if not copts:
        copts = []

    # Compile host and device code into library.
    lib = name + "_lib"
    native.cc_library(
        name = lib,
        hdrs = hdrs,
        copts = _rdc_copts() + copts,
        linkstatic = linkstatic,
        **kwargs
    )

    # Generate source file containing linked device code.
    dlink_hdrs = name + "_dlink_hdrs"
    dlink_cc = name + "_dlink.cc"
    _device_link(
        name = dlink_hdrs,
        deps = [lib],
        out = dlink_cc,
        gpu_archs = cuda_gpu_architectures(),
        nvlink_args = select({
            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
            "//conditions:default": [],
        }),
    )

    # Compile the source file into a library.
    dlink = name + "_dlink"
    native.cc_library(
        name = dlink,
        srcs = [dlink_cc],
        textual_hdrs = [dlink_hdrs],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
        ],
        defines = [
            # Silence warning about including internal header.
            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
            # Macros that need to be defined starting with CUDA 10.
            "__NV_EXTRA_INITIALIZATION=",
            "__NV_EXTRA_FINALIZATION=",
        ],
        linkstatic = linkstatic,
    )

    # Remove intermediate relocatable device code.
    pruned = name + "_pruned"
    _prune_relocatable_code(
        name = pruned,
        input = lib,
        gpu_archs = cuda_gpu_architectures(),
    )

    # Repackage the two libs into a single archive. This is required because
    # both libs reference symbols defined in the other one. For details, see
    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
    merged = name + "_merged"
    _merge_archive(
        name = merged,
        srcs = [pruned, dlink],
    )

    # Create cc target from archive.
    native.cc_library(
        name = name,
        srcs = [merged],
        hdrs = hdrs,
        linkstatic = linkstatic,
    )