/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Kernel-loader specs are structures that describe how to load a data-parallel
// kernel on a given platform for subsequent launching. Headers that instantiate
// these data structures will typically be auto-generated. However, users can
// also instantiate them by hand.
//
// A kernel with the same exact functionality and type signature may be
// implemented on several different platforms. Typical usage is to create a
// singleton that describes how to load a kernel on the various supported
// platforms:
//
//    static const MultiKernelLoaderSpec &SaxpySpec() {
//      static auto *mkls =
//          (new MultiKernelLoaderSpec{4 /* = arity */})
//              ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
//              ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
//
//      return *mkls;
//    }
//
// This lazily instantiates an object that describes how to load CUDA PTX
// present on disk that implements saxpy for the CUDA platform, or OpenCL text
// present on disk that implements saxpy for an OpenCL-based platform. The
// CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
// subsequent launching on a single platform.
42 // 43 // For the loader functionality that accepts these KernelLoaderSpecs in order 44 // to grab the kernel appropriately, see StreamExecutor::GetKernel(). 45 46 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ 47 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ 48 49 #include <stddef.h> 50 51 #include <map> 52 #include <memory> 53 54 #include "absl/strings/string_view.h" 55 #include "absl/synchronization/mutex.h" 56 #include "tensorflow/stream_executor/platform/logging.h" 57 #include "tensorflow/stream_executor/platform/port.h" 58 59 namespace stream_executor { 60 61 // Describes how to load a kernel on a target platform. 62 // 63 // This is an abstract base class, subclassed for specific platforms. 64 // The filename_or_text field represents the program location (i.e. PTX or 65 // OpenCL loadable translation unit path) and is simply stored; whether it is a 66 // filename or text is exposed via more specifically named accessors in 67 // subclasses. 68 // 69 // These kernel loader specifications are typically auto-generated into header 70 // files at build time, but can also be specified manually. 71 class KernelLoaderSpec { 72 public: ~KernelLoaderSpec()73 virtual ~KernelLoaderSpec() {} 74 75 // Returns the kernel name to load out of the program. kernelname()76 const std::string &kernelname() const { return kernelname_; } 77 78 protected: 79 explicit KernelLoaderSpec(absl::string_view kernelname); 80 81 private: 82 // The kernel name that should be loaded out of the program description given 83 // above. 84 std::string kernelname_; 85 86 SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec); 87 }; 88 89 // An abstract kernel loader spec that has an associated file path, where 90 // there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose 91 // canonical filename suffix is ".ptx". 
92 class OnDiskKernelLoaderSpec : public KernelLoaderSpec { 93 public: ~OnDiskKernelLoaderSpec()94 ~OnDiskKernelLoaderSpec() override {} 95 96 // Returns the path to the on-disk loadable kernel file. filename()97 const std::string &filename() const { return filename_; } 98 99 // Returns the canonical suffix for this on-disk kernel loader spec format; 100 // e.g. PTX files on disk have a canonical suffix of ".ptx". 101 virtual const char *CanonicalSuffix() const = 0; 102 103 protected: 104 OnDiskKernelLoaderSpec(absl::string_view filename, 105 absl::string_view kernelname); 106 107 std::string filename_; 108 109 private: 110 SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec); 111 }; 112 113 // Kernel loader specification for PTX text that resides on disk. 114 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec { 115 public: 116 CudaPtxOnDisk(absl::string_view filename, absl::string_view kernelname); ~CudaPtxOnDisk()117 ~CudaPtxOnDisk() override {} 118 CanonicalSuffix()119 const char *CanonicalSuffix() const override { return ".ptx"; } 120 121 private: 122 SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk); 123 }; 124 125 // Kernel loader specification for CUBIN binary that resides on disk. 126 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec { 127 public: 128 CudaCubinOnDisk(absl::string_view filename, absl::string_view kernelname); ~CudaCubinOnDisk()129 ~CudaCubinOnDisk() override {} 130 filename()131 const std::string &filename() const { return filename_; } 132 CanonicalSuffix()133 const char *CanonicalSuffix() const override { return ".cubin"; } 134 135 private: 136 std::string filename_; 137 138 SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk); 139 }; 140 141 // Kernel loader specification for PTX text that resides in memory. 142 class CudaPtxInMemory : public KernelLoaderSpec { 143 public: 144 // Components: compute capability major number, compute capability minor 145 // number, and PTX source. 
146 typedef std::tuple<int, int, absl::string_view> PtxSpec; 147 148 // Single-PTX constructor. Adds the provided PTX version with an unknown 149 // compute capability. Since the CC is unknown, the PTX is assumed to be very 150 // generally usable - in other words, PTX specified in this manner is VERY 151 // likely to be used as the default! Note that the PTX can be compressed, 152 // which is indicated by the argument ptx_compressed. 153 // 154 // Warning: the string backing the provided absl::string_view ptx must outlive 155 // this instance. 156 CudaPtxInMemory(absl::string_view ptx, absl::string_view kernelname, 157 bool ptx_compressed = false); 158 159 // Multiple-PTX-version constructor. Adds each item in spec_list to this 160 // object. Note that the PTX can be compressed, which is indicated by the 161 // argument ptx_compressed. 162 CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list, 163 absl::string_view kernel_name, bool ptx_compressed = false); ~CudaPtxInMemory()164 ~CudaPtxInMemory() override {} 165 166 // Add the PTX implementation described by ptx_spec to this object. On 167 // collision (i.e., if a version with the same compute_capability already 168 // exists), the existing implementation will be overwritten. 169 void AddSpec(PtxSpec ptx_spec); 170 171 // Returns pointer to the ptx of available implementation with the 172 // lowest-valued compute capability. For example, if PTX written to CC2.0, 173 // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns 174 // nullptr on failed lookup (if any version is not available). 175 // When the ptx is compressed, returns the decompressed ptx. 176 const char *default_text() const; 177 178 // Similar to default_text(). 179 // When the ptx is compressed, returns the decompressed ptx. 180 const char *original_default_text() const; 181 182 // Returns pointer to the ptx for the requested compute capability. 
183 // Returns nullptr on failed lookup (if the requested version is not 184 // available). 185 // When the ptx is compressed, returns the decompressed ptx. 186 const char *text(int compute_capability_major, 187 int compute_capability_minor) const; 188 189 // Similar to text(). 190 // When the ptx is compressed, returns the original compressed ptx. 191 const char *original_text(int compute_capability_major, 192 int compute_capability_minor) const; 193 194 // Decompresses the PTX string using bzip2. 195 static std::string DecompressPtx(const char *ptx); 196 197 private: 198 // PTX translation unit text contents in memory. The key is of as a tuple 199 // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CC's 200 // represented in this way have a clear sorting order, map::begin() will give 201 // the lowest-numbered version available, i.e. the default. 202 std::map<std::tuple<int, int>, const char *, 203 bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)> 204 ptx_by_compute_capability_; 205 206 // Stores all decompressed ptx strings, with original ptx string as keys. 207 // It is marked as mutable for lazy decompression. 208 mutable std::map<const char *, std::string> decompressed_ptx_; 209 mutable absl::Mutex mu_; 210 211 // Defines the minimum compute capability possible. Used when PTX has no 212 // compute capability specified (in the single-PTX constructor). 213 static const std::tuple<int, int> kMinimumCapability; 214 215 SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory); 216 }; 217 218 // Kernel loader specification for OpenCL text that resides on disk. 
219 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec { 220 public: 221 OpenCLTextOnDisk(absl::string_view filename, absl::string_view kernelname); ~OpenCLTextOnDisk()222 ~OpenCLTextOnDisk() override {} 223 CanonicalSuffix()224 const char *CanonicalSuffix() const override { return ".ocl"; } 225 226 private: 227 SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk); 228 }; 229 230 // Kernel loader specification for OpenCL binary that resides on disk. 231 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec { 232 public: 233 OpenCLBinaryOnDisk(absl::string_view filename, absl::string_view kernelname); ~OpenCLBinaryOnDisk()234 ~OpenCLBinaryOnDisk() override {} 235 CanonicalSuffix()236 const char *CanonicalSuffix() const override { return ".aocx"; } 237 238 private: 239 SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk); 240 }; 241 242 // Kernel loader specification for OpenCL text that resides in memory. 243 class OpenCLTextInMemory : public KernelLoaderSpec { 244 public: 245 OpenCLTextInMemory(absl::string_view text, absl::string_view kernelname); ~OpenCLTextInMemory()246 ~OpenCLTextInMemory() override {} 247 248 // Returns the OpenCL text contents. text()249 const std::string &text() const { return text_; } 250 251 private: 252 // OpenCL translation unit text contents in memory. 253 std::string text_; 254 255 SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory); 256 }; 257 258 // Kernel loader specification for a CUBIN blob that resides in memory. 259 class CudaCubinInMemory : public KernelLoaderSpec { 260 public: 261 CudaCubinInMemory(const char *bytes, absl::string_view kernelname); ~CudaCubinInMemory()262 ~CudaCubinInMemory() override {} 263 bytes()264 const char *bytes() const { return bytes_; } 265 266 private: 267 const char *bytes_; 268 269 SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory); 270 }; 271 272 // Describes how to load a kernel on any subset of a number of target platforms. 
273 class MultiKernelLoaderSpec { 274 public: 275 explicit MultiKernelLoaderSpec(size_t arity); 276 277 // Returns the number of arguments that this kernel accepts. arity()278 size_t arity() const { return arity_; } 279 280 // Convenience getters for testing whether these platform variants have 281 // kernel loader specifications available. has_cuda_ptx_on_disk()282 bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; } has_cuda_cubin_on_disk()283 bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; } has_cuda_cubin_in_memory()284 bool has_cuda_cubin_in_memory() const { 285 return cuda_cubin_in_memory_ != nullptr; 286 } has_cuda_ptx_in_memory()287 bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; } has_ocl_text_on_disk()288 bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; } has_ocl_binary_on_disk()289 bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; } has_ocl_text_in_memory()290 bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; } 291 292 // Accessors for platform variant kernel load specifications. 293 // Precondition: corresponding has_* is true. 
cuda_ptx_on_disk()294 const CudaPtxOnDisk &cuda_ptx_on_disk() const { 295 CHECK(has_cuda_ptx_on_disk()); 296 return *cuda_ptx_on_disk_; 297 } cuda_cubin_on_disk()298 const CudaCubinOnDisk &cuda_cubin_on_disk() const { 299 CHECK(has_cuda_cubin_on_disk()); 300 return *cuda_cubin_on_disk_; 301 } cuda_cubin_in_memory()302 const CudaCubinInMemory &cuda_cubin_in_memory() const { 303 CHECK(has_cuda_cubin_in_memory()); 304 return *cuda_cubin_in_memory_; 305 } cuda_ptx_in_memory()306 const CudaPtxInMemory &cuda_ptx_in_memory() const { 307 CHECK(has_cuda_ptx_in_memory()); 308 return *cuda_ptx_in_memory_; 309 } ocl_text_on_disk()310 const OpenCLTextOnDisk &ocl_text_on_disk() const { 311 CHECK(has_ocl_text_on_disk()); 312 return *ocl_text_on_disk_; 313 } ocl_binary_on_disk()314 const OpenCLBinaryOnDisk &ocl_binary_on_disk() const { 315 CHECK(has_ocl_binary_on_disk()); 316 return *ocl_binary_on_disk_; 317 } ocl_text_in_memory()318 const OpenCLTextInMemory &ocl_text_in_memory() const { 319 CHECK(has_ocl_text_in_memory()); 320 return *ocl_text_in_memory_; 321 } 322 323 // Builder-pattern-like methods for use in initializing a 324 // MultiKernelLoaderSpec. Each of these should be used at most once for a 325 // single MultiKernelLoaderSpec object. See file comment for example usage. 326 // 327 // Note that the kernelname parameter must be consistent with the kernel in 328 // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel 329 // name may be mangled by the compiler if it is not declared in an 330 // extern "C" scope. 
331 MultiKernelLoaderSpec *AddOpenCLTextOnDisk(absl::string_view filename, 332 absl::string_view kernelname); 333 MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(absl::string_view filename, 334 absl::string_view kernelname); 335 MultiKernelLoaderSpec *AddOpenCLTextInMemory(absl::string_view ocl_text, 336 absl::string_view kernelname); 337 MultiKernelLoaderSpec *AddCudaPtxOnDisk(absl::string_view filename, 338 absl::string_view kernelname); 339 MultiKernelLoaderSpec *AddCudaCubinOnDisk(absl::string_view filename, 340 absl::string_view kernelname); 341 MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes, 342 absl::string_view kernelname); 343 MultiKernelLoaderSpec *AddCudaPtxInMemory(absl::string_view ptx, 344 absl::string_view kernelname); 345 MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( 346 absl::string_view ptx, absl::string_view kernelname); 347 MultiKernelLoaderSpec *AddCudaPtxInMemory( 348 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, 349 absl::string_view kernelname); 350 MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory( 351 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list, 352 absl::string_view kernelname); 353 354 private: 355 std::unique_ptr<CudaPtxOnDisk> 356 cuda_ptx_on_disk_; // PTX text that resides in a file. 357 std::unique_ptr<CudaCubinOnDisk> 358 cuda_cubin_on_disk_; // Binary CUDA program in a file. 359 std::unique_ptr<CudaCubinInMemory> 360 cuda_cubin_in_memory_; // Binary CUDA program in memory. 361 std::unique_ptr<CudaPtxInMemory> 362 cuda_ptx_in_memory_; // PTX text that resides in memory. 363 std::unique_ptr<OpenCLTextOnDisk> 364 ocl_text_on_disk_; // OpenCL text that resides on disk. 365 std::unique_ptr<OpenCLBinaryOnDisk> 366 ocl_binary_on_disk_; // OpenCL binary that resides on disk. 367 std::unique_ptr<OpenCLTextInMemory> 368 ocl_text_in_memory_; // OpenCL text that resides in memory. 369 370 // Number of parameters that the kernel takes. 
(This is nicer to have in a 371 // constexpr than having to determine it from the types via template 372 // metaprogramming). 373 size_t arity_; 374 }; 375 376 } // namespace stream_executor 377 378 #endif // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_ 379