• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // Kernel-loader specs are structures that describe how to load a data-parallel
17 // kernel on a given platform for subsequent launching. Headers that instantiate
18 // these data structures will typically be auto-generated. However, users can
19 // also instantiate them by hand.
20 //
21 // A kernel with the same exact functionality and type signature may be
22 // implemented on several different platforms. Typical usage is to create a
23 // singleton that describes how to load a kernel on the various supported
24 // platforms:
25 //
26 //  static const MultiKernelLoaderSpec &SaxpySpec() {
27 //    static auto *mkls =
28 //        (new MultiKernelLoaderSpec{4 /* = arity */})
29 //            ->AddCudaPtxOnDisk(ptx_file_path, ptx_kernelname)
30 //            ->AddOpenCLTextOnDisk(opencl_text_file_path, ocl_kernelname);
//
33 //    return *mkls;
34 //  }
35 //
// This lazily instantiates an object that describes how to load CUDA PTX
// present on disk that implements saxpy for the CUDA platform, or
// OpenCL text present on disk that implements saxpy for an OpenCL-based
// platform. The CudaPtxOnDisk and OpenCLTextOnDisk objects are subtypes of
// KernelLoaderSpec -- KernelLoaderSpec describes how to load a kernel for
// subsequent launching on a single platform.
42 //
43 // For the loader functionality that accepts these KernelLoaderSpecs in order
44 // to grab the kernel appropriately, see StreamExecutor::GetKernel().
45 
46 #ifndef TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
47 #define TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
48 
#include <stddef.h>

#include <initializer_list>
#include <map>
#include <memory>
#include <tuple>

#include "tensorflow/stream_executor/lib/stringpiece.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
58 
59 namespace perftools {
60 namespace gputools {
61 
62 // Describes how to load a kernel on a target platform.
63 //
64 // This is an abstract base class, subclassed for specific platforms.
65 // The filename_or_text field represents the program location (i.e. PTX or
66 // OpenCL loadable translation unit path) and is simply stored; whether it is a
67 // filename or text is exposed via more specifically named accessors in
68 // subclasses.
69 //
70 // These kernel loader specifications are typically auto-generated into header
71 // files at build time, but can also be specified manually.
72 class KernelLoaderSpec {
73  public:
~KernelLoaderSpec()74   virtual ~KernelLoaderSpec() {}
75 
76   // Returns the kernel name to load out of the program.
kernelname()77   const string &kernelname() const { return kernelname_; }
78 
79  protected:
80   explicit KernelLoaderSpec(port::StringPiece kernelname);
81 
82  private:
83   // The kernel name that should be loaded out of the program description given
84   // above.
85   string kernelname_;
86 
87   SE_DISALLOW_COPY_AND_ASSIGN(KernelLoaderSpec);
88 };
89 
90 // An abstract kernel loader spec that has an associated file path, where
91 // there's a canonical suffix for the filename; e.g. see CudaPtxOnDisk whose
92 // canonical filename suffix is ".ptx".
93 class OnDiskKernelLoaderSpec : public KernelLoaderSpec {
94  public:
~OnDiskKernelLoaderSpec()95   ~OnDiskKernelLoaderSpec() override {}
96 
97   // Returns the path to the on-disk loadable kernel file.
filename()98   const string &filename() const { return filename_; }
99 
100   // Returns the canonical suffix for this on-disk kernel loader spec format;
101   // e.g. PTX files on disk have a canonical suffix of ".ptx".
102   virtual const char *CanonicalSuffix() const = 0;
103 
104  protected:
105   OnDiskKernelLoaderSpec(port::StringPiece filename,
106                          port::StringPiece kernelname);
107 
108   string filename_;
109 
110  private:
111   SE_DISALLOW_COPY_AND_ASSIGN(OnDiskKernelLoaderSpec);
112 };
113 
114 // Kernel loader specification for PTX text that resides on disk.
115 class CudaPtxOnDisk : public OnDiskKernelLoaderSpec {
116  public:
117   CudaPtxOnDisk(port::StringPiece filename, port::StringPiece kernelname);
~CudaPtxOnDisk()118   ~CudaPtxOnDisk() override {}
119 
CanonicalSuffix()120   const char *CanonicalSuffix() const override { return ".ptx"; }
121 
122  private:
123   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxOnDisk);
124 };
125 
126 // Kernel loader specification for CUBIN binary that resides on disk.
127 class CudaCubinOnDisk : public OnDiskKernelLoaderSpec {
128  public:
129   CudaCubinOnDisk(port::StringPiece filename, port::StringPiece kernelname);
~CudaCubinOnDisk()130   ~CudaCubinOnDisk() override {}
131 
filename()132   const string &filename() const { return filename_; }
133 
CanonicalSuffix()134   const char *CanonicalSuffix() const override { return ".cubin"; }
135 
136  private:
137   string filename_;
138 
139   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinOnDisk);
140 };
141 
142 // Kernel loader specification for PTX text that resides in memory.
143 class CudaPtxInMemory : public KernelLoaderSpec {
144  public:
145   // Components: compute capability major number, compute capability minor
146   // number, and PTX source.
147   typedef std::tuple<int, int, port::StringPiece> PtxSpec;
148 
149   // Single-PTX constructor. Adds the provided PTX version with an unknown
150   // compute capability. Since the CC is unknown, the PTX is assumed to be very
151   // generally usable - in other words, PTX specified in this manner is VERY
152   // likely to be used as the default! Note that the PTX can be compressed,
153   // which is indicated by the argument ptx_compressed.
154   //
155   // Warning: the string backing the provided port::StringPiece ptx must outlive this
156   // instance.
157   CudaPtxInMemory(port::StringPiece ptx, port::StringPiece kernelname,
158                   bool ptx_compressed = false);
159 
160   // Multiple-PTX-version constructor. Adds each item in spec_list to this
161   // object. Note that the PTX can be compressed, which is indicated by the
162   // argument ptx_compressed.
163   CudaPtxInMemory(const std::initializer_list<PtxSpec> &spec_list,
164                   port::StringPiece kernel_name, bool ptx_compressed = false);
~CudaPtxInMemory()165   ~CudaPtxInMemory() override {}
166 
167   // Add the PTX implementation described by ptx_spec to this object. On
168   // collision (i.e., if a version with the same compute_capability already
169   // exists), the existing implementation will be overwritten.
170   void AddSpec(PtxSpec ptx_spec);
171 
172   // Returns pointer to the ptx of available implementation with the
173   // lowest-valued compute capability. For example, if PTX written to CC2.0,
174   // 3.0, and 3.5 are all available, the version for CC2.0 will be set. Returns
175   // nullptr on failed lookup (if any version is not available).
176   // When the ptx is compressed, returns the decompressed ptx.
177   const char *default_text() const;
178 
179   // Similar to default_text().
180   // When the ptx is compressed, returns the decompressed ptx.
181   const char *original_default_text() const;
182 
183   // Returns pointer to the ptx for the requested compute capability.
184   // Returns nullptr on failed lookup (if the requested version is not
185   // available).
186   // When the ptx is compressed, returns the decompressed ptx.
187   const char *text(int compute_capability_major,
188                    int compute_capability_minor) const;
189 
190   // Similar to text().
191   // When the ptx is compressed, returns the original compressed ptx.
192   const char *original_text(int compute_capability_major,
193                             int compute_capability_minor) const;
194 
195   // Decompresses the PTX string using bzip2.
196   static string DecompressPtx(const char *ptx);
197 
198  private:
199   // PTX translation unit text contents in memory. The key is of as a tuple
200   // "<cc_major>,<cc_minor>", i.e., "2,0", "3,0", "3,5". Because CC's
201   // represented in this way have a clear sorting order, map::begin() will give
202   // the lowest-numbered version available, i.e. the default.
203   std::map<std::tuple<int, int>, const char *,
204            bool (*)(const std::tuple<int, int> &, const std::tuple<int, int> &)>
205       ptx_by_compute_capability_;
206 
207   // Stores all decompressed ptx strings, with original ptx string as keys.
208   // It is marked as mutable for lazy decompression.
209   mutable std::map<const char *, string> decompressed_ptx_;
210   mutable mutex mu_;
211 
212   // Defines the minimum compute capability possible. Used when PTX has no
213   // compute capability specified (in the single-PTX constructor).
214   static const std::tuple<int, int> kMinimumCapability;
215 
216   SE_DISALLOW_COPY_AND_ASSIGN(CudaPtxInMemory);
217 };
218 
219 // Kernel loader specification for OpenCL text that resides on disk.
220 class OpenCLTextOnDisk : public OnDiskKernelLoaderSpec {
221  public:
222   OpenCLTextOnDisk(port::StringPiece filename, port::StringPiece kernelname);
~OpenCLTextOnDisk()223   ~OpenCLTextOnDisk() override {}
224 
CanonicalSuffix()225   const char *CanonicalSuffix() const override { return ".ocl"; }
226 
227  private:
228   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextOnDisk);
229 };
230 
231 // Kernel loader specification for OpenCL binary that resides on disk.
232 class OpenCLBinaryOnDisk : public OnDiskKernelLoaderSpec {
233  public:
234   OpenCLBinaryOnDisk(port::StringPiece filename, port::StringPiece kernelname);
~OpenCLBinaryOnDisk()235   ~OpenCLBinaryOnDisk() override {}
236 
CanonicalSuffix()237   const char *CanonicalSuffix() const override { return ".aocx"; }
238 
239  private:
240   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLBinaryOnDisk);
241 };
242 
243 // Kernel loader specification for OpenCL text that resides in memory.
244 class OpenCLTextInMemory : public KernelLoaderSpec {
245  public:
246   OpenCLTextInMemory(port::StringPiece text, port::StringPiece kernelname);
~OpenCLTextInMemory()247   ~OpenCLTextInMemory() override {}
248 
249   // Returns the OpenCL text contents.
text()250   const string &text() const { return text_; }
251 
252  private:
253   // OpenCL translation unit text contents in memory.
254   string text_;
255 
256   SE_DISALLOW_COPY_AND_ASSIGN(OpenCLTextInMemory);
257 };
258 
259 // Kernel loader specification for a CUBIN blob that resides in memory.
260 class CudaCubinInMemory : public KernelLoaderSpec {
261  public:
262   CudaCubinInMemory(const char *bytes, port::StringPiece kernelname);
~CudaCubinInMemory()263   ~CudaCubinInMemory() override {}
264 
bytes()265   const char *bytes() const { return bytes_; }
266 
267  private:
268   const char *bytes_;
269 
270   SE_DISALLOW_COPY_AND_ASSIGN(CudaCubinInMemory);
271 };
272 
273 // Describes how to load a kernel on any subset of a number of target platforms.
274 class MultiKernelLoaderSpec {
275  public:
276   explicit MultiKernelLoaderSpec(size_t arity);
277 
278   // Returns the number of arguments that this kernel accepts.
arity()279   size_t arity() const { return arity_; }
280 
281   // Convenience getters for testing whether these platform variants have
282   // kernel loader specifications available.
has_cuda_ptx_on_disk()283   bool has_cuda_ptx_on_disk() const { return cuda_ptx_on_disk_ != nullptr; }
has_cuda_cubin_on_disk()284   bool has_cuda_cubin_on_disk() const { return cuda_cubin_on_disk_ != nullptr; }
has_cuda_cubin_in_memory()285   bool has_cuda_cubin_in_memory() const {
286     return cuda_cubin_in_memory_ != nullptr;
287   }
has_cuda_ptx_in_memory()288   bool has_cuda_ptx_in_memory() const { return cuda_ptx_in_memory_ != nullptr; }
has_ocl_text_on_disk()289   bool has_ocl_text_on_disk() const { return ocl_text_on_disk_ != nullptr; }
has_ocl_binary_on_disk()290   bool has_ocl_binary_on_disk() const { return ocl_binary_on_disk_ != nullptr; }
has_ocl_text_in_memory()291   bool has_ocl_text_in_memory() const { return ocl_text_in_memory_ != nullptr; }
292 
293   // Accessors for platform variant kernel load specifications.
294   // Precondition: corresponding has_* is true.
cuda_ptx_on_disk()295   const CudaPtxOnDisk &cuda_ptx_on_disk() const {
296     CHECK(has_cuda_ptx_on_disk());
297     return *cuda_ptx_on_disk_;
298   }
cuda_cubin_on_disk()299   const CudaCubinOnDisk &cuda_cubin_on_disk() const {
300     CHECK(has_cuda_cubin_on_disk());
301     return *cuda_cubin_on_disk_;
302   }
cuda_cubin_in_memory()303   const CudaCubinInMemory &cuda_cubin_in_memory() const {
304     CHECK(has_cuda_cubin_in_memory());
305     return *cuda_cubin_in_memory_;
306   }
cuda_ptx_in_memory()307   const CudaPtxInMemory &cuda_ptx_in_memory() const {
308     CHECK(has_cuda_ptx_in_memory());
309     return *cuda_ptx_in_memory_;
310   }
ocl_text_on_disk()311   const OpenCLTextOnDisk &ocl_text_on_disk() const {
312     CHECK(has_ocl_text_on_disk());
313     return *ocl_text_on_disk_;
314   }
ocl_binary_on_disk()315   const OpenCLBinaryOnDisk &ocl_binary_on_disk() const {
316     CHECK(has_ocl_binary_on_disk());
317     return *ocl_binary_on_disk_;
318   }
ocl_text_in_memory()319   const OpenCLTextInMemory &ocl_text_in_memory() const {
320     CHECK(has_ocl_text_in_memory());
321     return *ocl_text_in_memory_;
322   }
323 
324   // Builder-pattern-like methods for use in initializing a
325   // MultiKernelLoaderSpec. Each of these should be used at most once for a
326   // single MultiKernelLoaderSpec object. See file comment for example usage.
327   //
328   // Note that the kernelname parameter must be consistent with the kernel in
329   // the PTX or OpenCL being loaded. Also be aware that in CUDA C++ the kernel
330   // name may be mangled by the compiler if it is not declared in an
331   // extern "C" scope.
332   MultiKernelLoaderSpec *AddOpenCLTextOnDisk(port::StringPiece filename,
333                                              port::StringPiece kernelname);
334   MultiKernelLoaderSpec *AddOpenCLBinaryOnDisk(port::StringPiece filename,
335                                                port::StringPiece kernelname);
336   MultiKernelLoaderSpec *AddOpenCLTextInMemory(port::StringPiece ocl_text,
337                                                port::StringPiece kernelname);
338   MultiKernelLoaderSpec *AddCudaPtxOnDisk(port::StringPiece filename,
339                                           port::StringPiece kernelname);
340   MultiKernelLoaderSpec *AddCudaCubinOnDisk(port::StringPiece filename,
341                                             port::StringPiece kernelname);
342   MultiKernelLoaderSpec *AddCudaCubinInMemory(const char *cubin_bytes,
343                                               port::StringPiece kernelname);
344   MultiKernelLoaderSpec *AddCudaPtxInMemory(port::StringPiece ptx,
345                                             port::StringPiece kernelname);
346   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
347       port::StringPiece ptx, port::StringPiece kernelname);
348   MultiKernelLoaderSpec *AddCudaPtxInMemory(
349       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
350       port::StringPiece kernelname);
351   MultiKernelLoaderSpec *AddCudaCompressedPtxInMemory(
352       std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
353       port::StringPiece kernelname);
354 
355  private:
356   std::unique_ptr<CudaPtxOnDisk>
357       cuda_ptx_on_disk_;  // PTX text that resides in a file.
358   std::unique_ptr<CudaCubinOnDisk>
359       cuda_cubin_on_disk_;  // Binary CUDA program in a file.
360   std::unique_ptr<CudaCubinInMemory>
361       cuda_cubin_in_memory_;  // Binary CUDA program in memory.
362   std::unique_ptr<CudaPtxInMemory>
363       cuda_ptx_in_memory_;  // PTX text that resides in memory.
364   std::unique_ptr<OpenCLTextOnDisk>
365       ocl_text_on_disk_;  // OpenCL text that resides on disk.
366   std::unique_ptr<OpenCLBinaryOnDisk>
367       ocl_binary_on_disk_;  // OpenCL binary that resides on disk.
368   std::unique_ptr<OpenCLTextInMemory>
369       ocl_text_in_memory_;  // OpenCL text that resides in memory.
370 
371   // Number of parameters that the kernel takes. (This is nicer to have in a
372   // constexpr than having to determine it from the types via template
373   // metaprogramming).
374   size_t arity_;
375 };
376 
377 }  // namespace gputools
378 }  // namespace perftools
379 
380 #endif  // TENSORFLOW_STREAM_EXECUTOR_KERNEL_SPEC_H_
381