1"""Repository rule for NCCL.""" 2 3load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures") 4load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") 5 6# CUDA toolkit version as tuple (e.g. '(11, 1)'). 7_cuda_version = %{cuda_version} 8 9def _gen_device_srcs_impl(ctx): 10 ops = ["sum", "prod", "min", "max", "premulsum", "sumpostdiv"] 11 # TF uses CUDA version > 11.0, so enable bf16 type unconditionally. 12 types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "bf16", "f32", "f64"] 13 hdr_tail = "****************************************/" 14 defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d" 15 16 files = [] 17 for NCCL_OP, op in enumerate(ops): 18 for NCCL_TYPE, dt in enumerate(types): 19 substitutions = { 20 hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE), 21 } 22 for src in ctx.files.srcs: 23 name = "%s_%s_%s" % (op, dt, src.basename) 24 file = ctx.actions.declare_file(name, sibling = src) 25 ctx.actions.expand_template( 26 output = file, 27 template = src, 28 substitutions = substitutions, 29 ) 30 files.append(file) 31 return [DefaultInfo(files = depset(files))] 32 33gen_device_srcs = rule( 34 implementation = _gen_device_srcs_impl, 35 attrs = { 36 "srcs": attr.label_list(allow_files = True), 37 }, 38) 39"""Adds prefix to each file name in srcs and adds #define NCCL_OP.""" 40 41def _rdc_copts(): 42 """Returns copts for compiling relocatable device code.""" 43 44 # The global functions can not have a lower register count than the 45 # device functions. This is enforced by setting a fixed register count. 46 # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48 47 maxrregcount = "-maxrregcount=96" 48 49 return cuda_default_copts() + select({ 50 "@local_config_cuda//:is_cuda_compiler_nvcc": [ 51 "-nvcc_options", 52 "relocatable-device-code=true", 53 "-nvcc_options", 54 "ptxas-options=" + maxrregcount, 55 "-nvcc_options", 56 "extended-lambda", 57 ], 58 "@local_config_cuda//:is_cuda_compiler_clang": [ 59 "-fcuda-rdc", 60 "-Xcuda-ptxas", 61 maxrregcount, 62 ], 63 "//conditions:default": [], 64 }) 65 66def _lookup_file(filegroup, path): 67 """Extracts file at (relative) path in filegroup.""" 68 for file in filegroup.files: 69 if file.path.endswith(path): 70 return file 71 return None 72 73def _pic_only(files): 74 """Returns the PIC files if there are any in 'files', otherwise 'files'.""" 75 pic_only = [f for f in files if f.basename.find(".pic.") >= 0] 76 return pic_only if pic_only else files 77 78def _device_link_impl(ctx): 79 if not ctx.attr.gpu_archs: 80 fail("No GPU architecture specified. NCCL requires --config=cuda or similar.") 81 82 inputs = [] 83 for dep in ctx.attr.deps: 84 inputs += dep.files.to_list() 85 inputs = _pic_only(inputs) 86 87 # Device-link to cubins for each architecture. 88 name = ctx.attr.name 89 register_h = None 90 cubins = [] 91 images = [] 92 for arch in ctx.attr.gpu_archs: 93 arch = arch.replace("compute_", "sm_") # PTX is JIT-linked at runtime. 
        cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch))
        register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch))
        ctx.actions.run(
            outputs = [register_h, cubin],
            inputs = inputs,
            executable = ctx.file._nvlink,
            arguments = ctx.attr.nvlink_args + [
                "--arch=%s" % arch,
                "--register-link-binaries=%s" % register_h.path,
                "--output-file=%s" % cubin.path,
            ] + [file.path for file in inputs],
            mnemonic = "nvlink",
            use_default_shell_env = True,
        )
        cubins.append(cubin)
        images.append("--image=profile=%s,file=%s" % (arch, cubin.path))

    # Generate fatbin header from all cubins.
    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
    bin2c = ctx.file._bin2c
    arguments_list = [
        "-64",
        "--cmdline=--compile-only",
        "--link",
        "--compress-all",
        "--create=%s" % tmp_fatbin.path,
        "--embedded-fatbin=%s" % fatbin_h.path,
    ]
    if _cuda_version <= (10, 1):
        arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
    ctx.actions.run(
        outputs = [tmp_fatbin, fatbin_h],
        inputs = cubins,
        executable = ctx.file._fatbinary,
        arguments = arguments_list + images,
        tools = [bin2c],
        mnemonic = "fatbinary",
        use_default_shell_env = True,
    )

    # Generate the source file #including the headers generated above.
    ctx.actions.expand_template(
        output = ctx.outputs.out,
        template = ctx.file._link_stub,
        substitutions = {
            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
            "FATBINFILE": '"%s"' % fatbin_h.short_path,
        },
    )

    return [DefaultInfo(files = depset([register_h, fatbin_h]))]

_device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        "deps": attr.label_list(),
        "out": attr.output(mandatory = True),
        "gpu_archs": attr.string_list(),
        "nvlink_args": attr.string_list(),
        "_nvlink": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_fatbinary": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_bin2c": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_link_stub": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
)
"""Links device code and generates source code for kernel registration."""

def _prune_relocatable_code_impl(ctx):
    """Clears __nv_relfatbin section containing relocatable device code."""

    if _cuda_version < (11, 3):
        # -no-relocatable-elf not supported, return unpruned input.
        return ctx.attr.input[DefaultInfo]

    # nvcc --generate-code options for the active set of cuda architectures.
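    # For example (illustrative values), gpu_archs = ["sm_70", "compute_80"]
    # expands to:
    #   --generate-code=arch=sm_70,code=sm_70
    #   --generate-code=arch=sm_80,code=sm_80
    #   --generate-code=arch=sm_80,code=compute_80
    # i.e. a 'compute_XY' entry keeps both the SASS and the PTX for that arch.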
    gencodes = []
    for code in ctx.attr.gpu_archs:
        arch = code.replace("compute_", "sm_")
        if code != arch:
            gencodes.append((arch, arch))
        gencodes.append((arch, code))

    outputs = []
    for input in ctx.files.input:
        output = ctx.actions.declare_file(
            "pruned_" + input.basename,
            sibling = input,
        )
        arguments = (
            ["--generate-code=arch=%s,code=%s" % code for code in gencodes] +
            ["-no-relocatable-elf", "--output-file=%s" % output.path, str(input.path)]
        )
        ctx.actions.run(
            outputs = [output],
            inputs = [input],
            executable = ctx.file._nvprune,
            arguments = arguments,
            mnemonic = "nvprune",
            use_default_shell_env = True,
        )
        outputs.append(output)

    return DefaultInfo(files = depset(outputs))

_prune_relocatable_code = rule(
    implementation = _prune_relocatable_code_impl,
    attrs = {
        "input": attr.label(mandatory = True, allow_files = True),
        "gpu_archs": attr.string_list(),
        "_nvprune": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
    },
)

def _merge_archive_impl(ctx):
    # Generate an mri script to merge the archives in srcs and pass it to 'ar'.
    # See https://stackoverflow.com/a/23621751.
    files = _pic_only(ctx.files.srcs)
    mri_script = "create " + ctx.outputs.out.path
    for f in files:
        mri_script += r"\naddlib " + f.path
    mri_script += r"\nsave\nend"

    cc_toolchain = find_cpp_toolchain(ctx)
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
        command = "echo -e \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
        use_default_shell_env = True,
    )

_merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        "_cc_toolchain": attr.label(
            default = "@bazel_tools//tools/cpp:current_cc_toolchain",
        ),
        # "_crosstool": attr.label_list(
        #     cfg = "host",
        #     default = ["@bazel_tools//tools/cpp:crosstool"]
        # ),
    },
    outputs = {"out": "lib%{name}.a"},
)
"""Merges srcs into a single archive."""

def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
    r"""Produces a cuda_library using separate compilation and linking.

    CUDA separate compilation and linking allows device function calls across
    translation units. This is different from the normal whole program
    compilation where each translation unit contains all device code. For more
    background, see
    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation

    During separate compilation, the different CUDA source files are compiled
    to 'relocatable device code' (RDC) and embedded in the host object files.
    When using nvcc, linking the device code for each supported GPU
    architecture and generating kernel registration code for the CUDA runtime
    is handled automatically. Clang supports generating relocatable device
    code, but it can't link it. We therefore rely on tools provided by the
    CUDA SDK to link the device code and generate the host code to register
    the kernels.
    The nvlink tool extracts the RDC code from the object files and links it
    into cubin files, one per GPU architecture. It also produces a header file
    with a list of kernel names to register. The cubins are merged into a
    binary blob using the fatbinary tool, and converted to a C header file
    with the help of the bin2c tool. The registration header file, the
    fatbinary header file, and the link.stub file (shipped with the CUDA SDK)
    are compiled as ordinary host code.

    Here is a diagram of the CUDA separate compilation trajectory:

    x.cu.cc    y.cu.cc
          \    /               cc_library (compile RDC and archive)
           xy.a
          /    \               * nvlink
    register.h  xy.cubin
         :        |            * fatbinary and bin2c
         :   xy.fatbin.h
         :        :            * #include
        dlink.cc               * Expanded from crt/link.stub template
           |                   cc_library (host compile and archive)
        dlink.a

    The steps marked with '*' are implemented in the _device_link rule.

    The intermediate relocatable device code in xy.a is no longer needed at
    this point, and the corresponding __nv_relfatbin section is cleared by the
    _prune_relocatable_code rule (nvprune -no-relocatable-elf). The section is
    cleared rather than removed completely because it is referenced by
    relocations, and removing those as well breaks fatbin registration.

    The object files in both xy.a and dlink.a reference symbols defined in the
    other archive. The separate archives are a side effect of using two
    cc_library targets to implement a single compilation trajectory. We could
    fix this once Bazel supports C++ sandwich. For now, we just merge the two
    archives to avoid unresolved symbols:

                  xy.a
                   |            nvprune (clear __nv_relfatbin section)
    dlink.a   xy_pruned.a
          \    /                merge archive
        xy_merged.a
            |                   cc_library (or alternatively, cc_import)
       final target

    Another complication is that cc_library produces (depending on the
    configuration) both PIC and non-PIC archives, but the distinction is
    hidden from Starlark until C++ sandwich becomes available. We work around
    this by dropping the non-PIC files if PIC files are available.

    Args:
        name: Target name.
        hdrs: Header files.
        copts: Compiler options.
        linkstatic: Must be true.
        **kwargs: Any other arguments.
    """

    if not hdrs:
        hdrs = []
    if not copts:
        copts = []

    # Compile host and device code into library.
    lib = name + "_lib"
    native.cc_library(
        name = lib,
        hdrs = hdrs,
        copts = _rdc_copts() + copts,
        linkstatic = linkstatic,
        **kwargs
    )

    # Generate source file containing linked device code.
    dlink_hdrs = name + "_dlink_hdrs"
    dlink_cc = name + "_dlink.cc"
    _device_link(
        name = dlink_hdrs,
        deps = [lib],
        out = dlink_cc,
        gpu_archs = cuda_gpu_architectures(),
        nvlink_args = select({
            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
            "//conditions:default": [],
        }),
    )

    # Compile the source file into a library.
    dlink = name + "_dlink"
    native.cc_library(
        name = dlink,
        srcs = [dlink_cc],
        textual_hdrs = [dlink_hdrs],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
        ],
        defines = [
            # Silence warning about including internal header.
            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
            # Macros that need to be defined starting with CUDA 10.
382 "__NV_EXTRA_INITIALIZATION=", 383 "__NV_EXTRA_FINALIZATION=", 384 ], 385 linkstatic = linkstatic, 386 ) 387 388 # Remove intermediate relocatable device code. 389 pruned = name + "_pruned" 390 _prune_relocatable_code( 391 name = pruned, 392 input = lib, 393 gpu_archs = cuda_gpu_architectures(), 394 ) 395 396 # Repackage the two libs into a single archive. This is required because 397 # both libs reference symbols defined in the other one. For details, see 398 # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking 399 merged = name + "_merged" 400 _merge_archive( 401 name = merged, 402 srcs = [pruned, dlink], 403 ) 404 405 # Create cc target from archive. 406 native.cc_library( 407 name = name, 408 srcs = [merged], 409 hdrs = hdrs, 410 linkstatic = linkstatic, 411 ) 412