1"""Repository rule for NCCL.""" 2 3load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures") 4load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain") 5 6# CUDA toolkit version as tuple (e.g. '(11, 1)'). 7_cuda_version = %{cuda_version} 8 9def _gen_device_srcs_impl(ctx): 10 ops = ["sum", "prod", "min", "max"] 11 types = ["i8", "u8", "i32", "u32", "i64", "u64", "f16", "f32", "f64"] 12 hdr_tail = "****************************************/" 13 defines = "\n\n#define NCCL_OP %d\n#define NCCL_TYPE %d" 14 15 files = [] 16 for NCCL_OP, op in enumerate(ops): 17 for NCCL_TYPE, dt in enumerate(types): 18 substitutions = { 19 hdr_tail: hdr_tail + defines % (NCCL_OP, NCCL_TYPE), 20 } 21 for src in ctx.files.srcs: 22 name = "%s_%s_%s" % (op, dt, src.basename) 23 file = ctx.actions.declare_file(name, sibling = src) 24 ctx.actions.expand_template( 25 output = file, 26 template = src, 27 substitutions = substitutions, 28 ) 29 files.append(file) 30 return [DefaultInfo(files = depset(files))] 31 32gen_device_srcs = rule( 33 implementation = _gen_device_srcs_impl, 34 attrs = { 35 "srcs": attr.label_list(allow_files = True), 36 }, 37) 38"""Adds prefix to each file name in srcs and adds #define NCCL_OP.""" 39 40def _rdc_copts(): 41 """Returns copts for compiling relocatable device code.""" 42 43 # The global functions can not have a lower register count than the 44 # device functions. This is enforced by setting a fixed register count. 45 # https://github.com/NVIDIA/nccl/blob/f93fe9bfd94884cec2ba711897222e0df5569a53/makefiles/common.mk#L48 46 maxrregcount = "-maxrregcount=96" 47 48 return cuda_default_copts() + select({ 49 "@local_config_cuda//cuda:using_nvcc": [ 50 "-nvcc_options", 51 "relocatable-device-code=true", 52 "-nvcc_options", 53 "ptxas-options=" + maxrregcount, 54 ], 55 "@local_config_cuda//cuda:using_clang": [ 56 "-fcuda-rdc", 57 "-Xcuda-ptxas", 58 maxrregcount, 59 ], 60 "//conditions:default": [], 61 }) 62 63def _lookup_file(filegroup, path): 64 """Extracts file at (relative) path in filegroup.""" 65 for file in filegroup.files: 66 if file.path.endswith(path): 67 return file 68 return None 69 70def _pic_only(files): 71 """Returns the PIC files if there are any in 'files', otherwise 'files'.""" 72 pic_only = [f for f in files if f.basename.find(".pic.") >= 0] 73 return pic_only if pic_only else files 74 75def _device_link_impl(ctx): 76 if not ctx.attr.gpu_archs: 77 fail("No GPU architecture specified. NCCL requires --config=cuda or similar.") 78 79 inputs = [] 80 for dep in ctx.attr.deps: 81 inputs += dep.files.to_list() 82 inputs = _pic_only(inputs) 83 84 # Device-link to cubins for each architecture. 85 name = ctx.attr.name 86 register_h = None 87 cubins = [] 88 images = [] 89 for arch in ctx.attr.gpu_archs: 90 arch = arch.replace("compute_", "sm_") # PTX is JIT-linked at runtime. 91 cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch)) 92 register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch)) 93 ctx.actions.run( 94 outputs = [register_h, cubin], 95 inputs = inputs, 96 executable = ctx.file._nvlink, 97 arguments = ctx.attr.nvlink_args + [ 98 "--arch=%s" % arch, 99 "--register-link-binaries=%s" % register_h.path, 100 "--output-file=%s" % cubin.path, 101 ] + [file.path for file in inputs], 102 mnemonic = "nvlink", 103 ) 104 cubins.append(cubin) 105 images.append("--image=profile=%s,file=%s" % (arch, cubin.path)) 106 107 # Generate fatbin header from all cubins. 
    tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name)
    fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name)
    bin2c = ctx.file._bin2c
    arguments_list = [
        "-64",
        "--cmdline=--compile-only",
        "--link",
        "--compress-all",
        "--create=%s" % tmp_fatbin.path,
        "--embedded-fatbin=%s" % fatbin_h.path,
    ]
    if _cuda_version <= (10, 1):
        arguments_list.append("--bin2c-path=%s" % bin2c.dirname)
    ctx.actions.run(
        outputs = [tmp_fatbin, fatbin_h],
        inputs = cubins,
        executable = ctx.file._fatbinary,
        arguments = arguments_list + images,
        tools = [bin2c],
        mnemonic = "fatbinary",
    )

    # Generate the source file #including the headers generated above.
    ctx.actions.expand_template(
        output = ctx.outputs.out,
        template = ctx.file._link_stub,
        substitutions = {
            "REGISTERLINKBINARYFILE": '"%s"' % register_h.short_path,
            "FATBINFILE": '"%s"' % fatbin_h.short_path,
        },
    )

    return [DefaultInfo(files = depset([register_h, fatbin_h]))]

_device_link = rule(
    implementation = _device_link_impl,
    attrs = {
        "deps": attr.label_list(),
        "out": attr.output(mandatory = True),
        "gpu_archs": attr.string_list(),
        "nvlink_args": attr.string_list(),
        "_nvlink": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvlink"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_fatbinary": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/fatbinary"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_bin2c": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/bin2c"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
        "_link_stub": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/crt/link.stub"),
            allow_single_file = True,
        ),
    },
)
"""Links device code and generates source code for kernel registration."""

def _prune_relocatable_code_impl(ctx):
    """Clears __nv_relfatbin section containing relocatable device code."""

    if _cuda_version < (11, 3):
        # -no-relocatable-elf not supported, return unpruned input.
        return ctx.attr.input[DefaultInfo]

    # nvcc --generate-code options for the active set of cuda architectures.
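    # As an illustration (hypothetical architecture list), gpu_archs of
    # ["sm_70", "compute_80"] yields gencodes of
    # [("sm_70", "sm_70"), ("sm_80", "sm_80"), ("sm_80", "compute_80")],
    # i.e. the nvprune arguments
    #   --generate-code=arch=sm_70,code=sm_70
    #   --generate-code=arch=sm_80,code=sm_80
    #   --generate-code=arch=sm_80,code=compute_80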
    gencodes = []
    for code in ctx.attr.gpu_archs:
        arch = code.replace("compute_", "sm_")
        if code != arch:
            gencodes.append((arch, arch))
        gencodes.append((arch, code))

    outputs = []
    for input in ctx.files.input:
        output = ctx.actions.declare_file(
            "pruned_" + input.basename,
            sibling = input,
        )
        arguments = (
            ["--generate-code=arch=%s,code=%s" % code for code in gencodes] +
            ["-no-relocatable-elf", "--output-file=%s" % output.path, str(input.path)]
        )
        ctx.actions.run(
            outputs = [output],
            inputs = [input],
            executable = ctx.file._nvprune,
            arguments = arguments,
            mnemonic = "nvprune",
        )
        outputs.append(output)

    return DefaultInfo(files = depset(outputs))

_prune_relocatable_code = rule(
    implementation = _prune_relocatable_code_impl,
    attrs = {
        "input": attr.label(mandatory = True, allow_files = True),
        "gpu_archs": attr.string_list(),
        "_nvprune": attr.label(
            default = Label("@local_config_cuda//cuda:cuda/bin/nvprune"),
            allow_single_file = True,
            executable = True,
            cfg = "host",
        ),
    },
)

def _merge_archive_impl(ctx):
    # Generate an MRI script to merge the archives in srcs and pass it to 'ar'.
    # See https://stackoverflow.com/a/23621751.
    files = _pic_only(ctx.files.srcs)
    mri_script = "create " + ctx.outputs.out.path
    for f in files:
        mri_script += r"\naddlib " + f.path
    mri_script += r"\nsave\nend"

    cc_toolchain = find_cpp_toolchain(ctx)
    ctx.actions.run_shell(
        inputs = ctx.files.srcs,  # + ctx.files._crosstool,
        outputs = [ctx.outputs.out],
        command = "echo -e \"%s\" | %s -M" % (mri_script, cc_toolchain.ar_executable),
    )
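# For illustration only (hypothetical file names), the MRI script generated
# above and piped to 'ar -M' expands to roughly:
#   create libfoo_merged.a
#   addlib pruned_libfoo_lib.pic.a
#   addlib libfoo_dlink.pic.a
#   save
#   end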
_merge_archive = rule(
    implementation = _merge_archive_impl,
    attrs = {
        "srcs": attr.label_list(mandatory = True, allow_files = True),
        "_cc_toolchain": attr.label(
            default = "@bazel_tools//tools/cpp:current_cc_toolchain",
        ),
        # "_crosstool": attr.label_list(
        #     cfg = "host",
        #     default = ["@bazel_tools//tools/cpp:crosstool"]
        # ),
    },
    outputs = {"out": "lib%{name}.a"},
)
"""Merges srcs into a single archive."""

def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwargs):
    r"""Produces a cuda_library using separate compilation and linking.

    CUDA separate compilation and linking allows device function calls across
    translation units. This is different from the normal whole program
    compilation where each translation unit contains all device code. For more
    background, see
    https://devblogs.nvidia.com/separate-compilation-linking-cuda-device-code/,
    https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-options-for-separate-compilation

    During separate compilation, the different CUDA source files are compiled
    to 'relocatable device code' (RDC) and embedded in the host object files.
    When using nvcc, linking the device code for each supported GPU
    architecture and generating kernel registration code for the CUDA runtime
    is handled automatically. Clang supports generating relocatable device
    code, but it can't link it. We therefore rely on tools provided by the CUDA
    SDK to link the device code and generate the host code to register the
    kernels.

    The nvlink tool extracts the RDC code from the object files and links it
    into cubin files, one per GPU architecture. It also produces a header file
    with a list of kernel names to register. The cubins are merged into a
    binary blob using the fatbinary tool, and converted to a C header file with
    the help of the bin2c tool. The registration header file, the fatbinary
    header file, and the link.stub file (shipped with the CUDA SDK) are
    compiled as ordinary host code.

    Here is a diagram of the CUDA separate compilation trajectory:

     x.cu.cc        y.cu.cc
           \        /              cc_library (compile RDC and archive)
              xy.a
            /      \               * nvlink
    register.h     xy.cubin
        :             |            * fatbinary and bin2c
        :          xy.fatbin.h
        :             :            * #include
        dlink.cc                   * Expanded from crt/link.stub template
           |                       cc_library (host compile and archive)
        dlink.a

    The steps marked with '*' are implemented in the _device_link rule.

    The intermediate relocatable device code in xy.a is no longer needed at
    this point. The _prune_relocatable_code rule therefore prunes it with
    nvprune's -no-relocatable-elf option (on CUDA 11.3 and later; older
    toolkits do not support the option and the input is passed through
    unpruned).

    The object files in both xy.a and dlink.a reference symbols defined in the
    other archive. The separate archives are a side effect of using two
    cc_library targets to implement a single compilation trajectory. We could
    fix this once bazel supports C++ sandwich. For now, we just merge the two
    archives to avoid unresolved symbols:

          xy.a
            |                      nvprune -no-relocatable-elf
    dlink.a xy_pruned.a
          \    /                   merge archive
        xy_merged.a
            |                      cc_library (or alternatively, cc_import)
       final target

    Another complication is that cc_library produces (depending on the
    configuration) both PIC and non-PIC archives, but the distinction
    is hidden from Starlark until C++ sandwich becomes available. We work
    around this by dropping the non-PIC files if PIC files are available.

    A commented usage sketch appears at the end of this file.

    Args:
      name: Target name.
      hdrs: Header files.
      copts: Compiler options.
      linkstatic: Must be true.
      **kwargs: Any other arguments.
    """

    if not hdrs:
        hdrs = []
    if not copts:
        copts = []

    # Compile host and device code into library.
    lib = name + "_lib"
    native.cc_library(
        name = lib,
        hdrs = hdrs,
        copts = _rdc_copts() + copts,
        linkstatic = linkstatic,
        **kwargs
    )

    # Generate source file containing linked device code.
    dlink_hdrs = name + "_dlink_hdrs"
    dlink_cc = name + "_dlink.cc"
    _device_link(
        name = dlink_hdrs,
        deps = [lib],
        out = dlink_cc,
        gpu_archs = cuda_gpu_architectures(),
        nvlink_args = select({
            "@org_tensorflow//tensorflow:linux_x86_64": ["--cpu-arch=X86_64"],
            "@org_tensorflow//tensorflow:linux_ppc64le": ["--cpu-arch=PPC64LE"],
            "//conditions:default": [],
        }),
    )

    # Compile the source file into a library.
    dlink = name + "_dlink"
    native.cc_library(
        name = dlink,
        srcs = [dlink_cc],
        textual_hdrs = [dlink_hdrs],
        deps = [
            "@local_config_cuda//cuda:cuda_headers",
        ],
        defines = [
            # Silence warning about including internal header.
            "__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__",
            # Macros that need to be defined starting with CUDA 10.
            "__NV_EXTRA_INITIALIZATION=",
            "__NV_EXTRA_FINALIZATION=",
        ],
        linkstatic = linkstatic,
    )

    # Remove intermediate relocatable device code.
    pruned = name + "_pruned"
    _prune_relocatable_code(
        name = pruned,
        input = lib,
        gpu_archs = cuda_gpu_architectures(),
    )

    # Repackage the two libs into a single archive. This is required because
    # both libs reference symbols defined in the other one. For details, see
    # https://eli.thegreenplace.net/2013/07/09/library-order-in-static-linking
    merged = name + "_merged"
    _merge_archive(
        name = merged,
        srcs = [pruned, dlink],
    )

    # Create cc target from archive.
    native.cc_library(
        name = name,
        srcs = [merged],
        hdrs = hdrs,
        linkstatic = linkstatic,
    )
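# A minimal usage sketch (target names, file paths and deps below are
# hypothetical, not part of this file). A BUILD file that loads these
# definitions could wire the rules together roughly like this:
#
#   load(":build_defs.bzl", "cuda_rdc_library", "gen_device_srcs")
#
#   # Expand one source file per (op, type) pair, each with its NCCL_OP and
#   # NCCL_TYPE defines appended to the header banner.
#   gen_device_srcs(
#       name = "device_srcs",
#       srcs = ["collectives/device/functions.cu.cc"],
#   )
#
#   # Compile and device-link the expanded sources into a single cc target.
#   cuda_rdc_library(
#       name = "device",
#       srcs = [":device_srcs"],
#       hdrs = ["collectives/device/common.h"],
#       deps = ["@local_config_cuda//cuda:cuda_headers"],
#   )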