• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
#include "tensorflow/stream_executor/kernel_spec.h"

#include <cstring>
17 
18 
19 namespace perftools {
20 namespace gputools {
21 
KernelLoaderSpec(port::StringPiece kernelname)22 KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
23     : kernelname_(kernelname.ToString()) {}
24 
OnDiskKernelLoaderSpec(port::StringPiece filename,port::StringPiece kernelname)25 OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename,
26                                                port::StringPiece kernelname)
27     : KernelLoaderSpec(kernelname), filename_(filename.ToString()) {}
28 
CudaPtxOnDisk(port::StringPiece filename,port::StringPiece kernelname)29 CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename,
30                              port::StringPiece kernelname)
31     : OnDiskKernelLoaderSpec(filename, kernelname) {}
32 
CudaCubinOnDisk(port::StringPiece filename,port::StringPiece kernelname)33 CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename,
34                                  port::StringPiece kernelname)
35     : OnDiskKernelLoaderSpec(filename, kernelname) {}
36 
CudaCubinInMemory(const char * bytes,port::StringPiece kernelname)37 CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
38                                      port::StringPiece kernelname)
39     : KernelLoaderSpec(kernelname), bytes_(bytes) {}
40 
// Strict-weak ordering over (major, minor) compute capabilities.
//
// Returns true when `lhs` sorts before `rhs`: a lower major version wins, and
// for equal major versions the lower minor version wins. This is exactly the
// lexicographic ordering std::tuple's operator< already implements, so it is
// used directly instead of spelling the comparison out by hand.
bool CompareComputeCapability(const std::tuple<int, int> &lhs,
                              const std::tuple<int, int> &rhs) {
  return lhs < rhs;
}
47 
48 const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0};
49 
CudaPtxInMemory(port::StringPiece ptx,port::StringPiece kernel_name,bool ptx_compressed)50 CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
51                                  port::StringPiece kernel_name,
52                                  bool ptx_compressed)
53     : KernelLoaderSpec(kernel_name),
54       ptx_by_compute_capability_(CompareComputeCapability) {
55   if (ptx_compressed) {
56     // Lazy decompression. Put an empty string in decompressed_ptx_ showing that
57     // the original ptx is compressed.
58     decompressed_ptx_[ptx.data()] = "";
59   }
60   ptx_by_compute_capability_[kMinimumCapability] = ptx.data();
61 }
62 
CudaPtxInMemory(const std::initializer_list<CudaPtxInMemory::PtxSpec> & spec_list,port::StringPiece kernel_name,bool ptx_compressed)63 CudaPtxInMemory::CudaPtxInMemory(
64     const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list,
65     port::StringPiece kernel_name, bool ptx_compressed)
66     : KernelLoaderSpec(kernel_name),
67       ptx_by_compute_capability_(CompareComputeCapability) {
68   for (const auto &spec : spec_list) {
69     int major, minor;
70     port::StringPiece ptx;
71     std::tie(major, minor, ptx) = spec;
72     if (ptx_compressed) {
73       // Lazy decompression. Put an empty string in decompressed_ptx_ showing
74       // that the original ptx is compressed.
75       decompressed_ptx_[ptx.data()] = "";
76     }
77     ptx_by_compute_capability_[std::tuple<int, int>{major, minor}] = ptx.data();
78   }
79 }
80 
DecompressPtx(const char * ptx)81 string CudaPtxInMemory::DecompressPtx(const char *ptx) {
82   // Get the length of the PTX string from the beginning of the buffer.
83   uint64 ptx_length = *reinterpret_cast<const uint64 *>(ptx);
84   // Get the PTX string from the buffer with offset and length.
85   string compressed_ptx(ptx + sizeof(uint64),
86                         ptx + sizeof(uint64) + ptx_length);
87   string decompressed_ptx;
88   // Decompress the PTX string with bzip2.
89   LOG(FATAL) << "bzip2 decompression is not supported yet.";
90   return decompressed_ptx;
91 }
92 
default_text() const93 const char *CudaPtxInMemory::default_text() const {
94   if (ptx_by_compute_capability_.empty()) {
95     return nullptr;
96   }
97 
98   mutex_lock lock{mu_};
99 
100   auto ptx = ptx_by_compute_capability_.begin()->second;
101   // Check if there is an entry in decompressed ptx table.
102   auto decompressed_ptx_iter = decompressed_ptx_.find(ptx);
103   if (decompressed_ptx_iter != decompressed_ptx_.end()) {
104     // If the decompressed string is empty, which means the ptx hasn't been
105     // decompressed, decompress it here.
106     if (decompressed_ptx_iter->second.empty()) {
107       decompressed_ptx_iter->second = DecompressPtx(ptx);
108     }
109     return decompressed_ptx_iter->second.c_str();
110   }
111   return ptx;
112 }
113 
original_default_text() const114 const char *CudaPtxInMemory::original_default_text() const {
115   if (ptx_by_compute_capability_.empty()) {
116     return nullptr;
117   }
118 
119   return ptx_by_compute_capability_.begin()->second;
120 }
121 
text(int compute_capability_major,int compute_capability_minor) const122 const char *CudaPtxInMemory::text(int compute_capability_major,
123                                   int compute_capability_minor) const {
124   std::tuple<int, int> capability{compute_capability_major,
125                                   compute_capability_minor};
126 
127   auto ptx_iter = ptx_by_compute_capability_.find(capability);
128   if (ptx_iter == ptx_by_compute_capability_.end()) {
129     return nullptr;
130   }
131 
132   mutex_lock lock{mu_};
133 
134   // Check if there is an entry in decompressed ptx table.
135   auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
136   if (decompressed_ptx_iter != decompressed_ptx_.end()) {
137     // If the decompressed string is empty, which means the ptx hasn't been
138     // decompressed, decompress it here.
139     if (decompressed_ptx_iter->second.empty()) {
140       decompressed_ptx_iter->second = DecompressPtx(ptx_iter->second);
141     }
142     return decompressed_ptx_iter->second.c_str();
143   }
144   return ptx_iter->second;
145 }
146 
original_text(int compute_capability_major,int compute_capability_minor) const147 const char *CudaPtxInMemory::original_text(int compute_capability_major,
148                                            int compute_capability_minor) const {
149   std::tuple<int, int> capability{compute_capability_major,
150                                   compute_capability_minor};
151 
152   auto ptx_iter = ptx_by_compute_capability_.find(capability);
153   if (ptx_iter == ptx_by_compute_capability_.end()) {
154     return nullptr;
155   }
156 
157   return ptx_iter->second;
158 }
159 
OpenCLTextOnDisk(port::StringPiece filename,port::StringPiece kernelname)160 OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename,
161                                    port::StringPiece kernelname)
162     : OnDiskKernelLoaderSpec(filename, kernelname) {}
163 
OpenCLTextInMemory(port::StringPiece text,port::StringPiece kernelname)164 OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text,
165                                        port::StringPiece kernelname)
166     : KernelLoaderSpec(kernelname), text_(text.ToString()) {}
167 
OpenCLBinaryOnDisk(port::StringPiece filename,port::StringPiece kernelname)168 OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename,
169                                        port::StringPiece kernelname)
170     : OnDiskKernelLoaderSpec(filename, kernelname) {}
171 
AddOpenCLTextOnDisk(port::StringPiece filename,port::StringPiece kernelname)172 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
173     port::StringPiece filename, port::StringPiece kernelname) {
174   CHECK(ocl_text_on_disk_ == nullptr);
175   ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
176   return this;
177 }
178 
AddOpenCLBinaryOnDisk(port::StringPiece filename,port::StringPiece kernelname)179 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
180     port::StringPiece filename, port::StringPiece kernelname) {
181   CHECK(ocl_binary_on_disk_ == nullptr);
182   ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
183   return this;
184 }
185 
AddOpenCLTextInMemory(port::StringPiece filename,port::StringPiece kernelname)186 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
187     port::StringPiece filename, port::StringPiece kernelname) {
188   CHECK(ocl_text_in_memory_ == nullptr);
189   ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname});
190   return this;
191 }
192 
AddCudaPtxOnDisk(port::StringPiece filename,port::StringPiece kernelname)193 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
194     port::StringPiece filename, port::StringPiece kernelname) {
195   CHECK(cuda_ptx_on_disk_ == nullptr);
196   cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
197   return this;
198 }
199 
AddCudaCubinInMemory(const char * bytes,port::StringPiece kernelname)200 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
201     const char *bytes, port::StringPiece kernelname) {
202   CHECK(cuda_cubin_in_memory_ == nullptr);
203   cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
204   return this;
205 }
206 
AddCudaCubinOnDisk(port::StringPiece filename,port::StringPiece kernelname)207 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
208     port::StringPiece filename, port::StringPiece kernelname) {
209   CHECK(cuda_cubin_on_disk_ == nullptr);
210   cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
211   return this;
212 }
213 
AddCudaPtxInMemory(port::StringPiece ptx,port::StringPiece kernelname)214 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
215     port::StringPiece ptx, port::StringPiece kernelname) {
216   CHECK(cuda_ptx_in_memory_ == nullptr);
217   cuda_ptx_in_memory_.reset(
218       new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
219   return this;
220 }
221 
AddCudaCompressedPtxInMemory(port::StringPiece ptx,port::StringPiece kernelname)222 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
223     port::StringPiece ptx, port::StringPiece kernelname) {
224   CHECK(cuda_ptx_in_memory_ == nullptr);
225   cuda_ptx_in_memory_.reset(
226       new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
227   return this;
228 }
229 
AddCudaPtxInMemory(std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,port::StringPiece kernelname)230 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
231     std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
232     port::StringPiece kernelname) {
233   CHECK(cuda_ptx_in_memory_ == nullptr);
234   cuda_ptx_in_memory_.reset(
235       new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
236   return this;
237 }
238 
AddCudaCompressedPtxInMemory(std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,port::StringPiece kernelname)239 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
240     std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
241     port::StringPiece kernelname) {
242   CHECK(cuda_ptx_in_memory_ == nullptr);
243   cuda_ptx_in_memory_.reset(
244       new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
245   return this;
246 }
247 
MultiKernelLoaderSpec(size_t arity)248 MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
249 
250 }  // namespace gputools
251 }  // namespace perftools
252