1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_ 17 #define TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_ 18 19 #include <vector> 20 21 #include "absl/types/span.h" 22 #include "tensorflow/stream_executor/gpu/gpu_asm_opts.h" 23 #include "tensorflow/stream_executor/lib/statusor.h" 24 #include "tensorflow/stream_executor/platform/port.h" 25 26 namespace stream_executor { 27 namespace gpu { 28 class GpuContext; 29 } 30 31 // Compiles the given PTX string using ptxas and returns the resulting machine 32 // code (i.e. a cubin) as a byte array. The generated cubin matches the compute 33 // capabilities of the device associated with 'device_ordinal'. 34 // 35 // 'options' is used to query for the CUDA location in case it is 36 // customized in a passed flag, and for controlling ptxas optimizations. 37 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int device_ordinal, 38 const char* ptx_contents, 39 GpuAsmOpts options); 40 41 // Compiles the given PTX string using ptxas and returns the resulting machine 42 // code (i.e. a cubin) as a byte array. The generated cubin matches the compute 43 // capabilities provided by 'cc_major' and 'cc_minor'. 44 // 45 // 'options' is used to query for the CUDA location in case it is 46 // customized in a passed flag, and for controlling ptxas optimizations. 47 port::StatusOr<std::vector<uint8>> CompileGpuAsm(int cc_major, int cc_minor, 48 const char* ptx_contents, 49 GpuAsmOpts options); 50 51 // Same as CompileGpuAsm, but caches the result, and returns unowned view of 52 // the compiled binary. 53 // 54 // A copy of the string provided in ptx will be made. 55 port::StatusOr<absl::Span<const uint8>> CompileGpuAsmOrGetCached( 56 int device_ordinal, const char* ptx, GpuAsmOpts compilation_options); 57 58 struct CubinOrPTXImage { 59 std::string profile; 60 std::vector<uint8> bytes; 61 }; 62 63 // Bundles the GPU machine code (cubins) and PTX if requested and returns the 64 // resulting binary (i.e. a fatbin) as a byte array. 65 port::StatusOr<std::vector<uint8>> BundleGpuAsm( 66 std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir); 67 68 struct HsacoImage { 69 std::string gfx_arch; 70 std::vector<uint8> bytes; 71 }; 72 73 // Bundles the GPU machine code (HSA Code Object) and returns the resulting 74 // binary (i.e. a fatbin) as a byte array. 75 port::StatusOr<std::vector<uint8>> BundleGpuAsm( 76 std::vector<HsacoImage> images, const std::string rocm_root_dir); 77 78 // Links multiple relocatable GPU images (e.g. results of ptxas -c) into a 79 // single image. 80 port::StatusOr<std::vector<uint8>> LinkGpuAsm( 81 gpu::GpuContext* context, std::vector<CubinOrPTXImage> images); 82 83 } // namespace stream_executor 84 85 #endif // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_ 86