1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
#include "tensorflow/stream_executor/kernel_spec.h"

#include <cstring>
17
18
19 namespace perftools {
20 namespace gputools {
21
KernelLoaderSpec(port::StringPiece kernelname)22 KernelLoaderSpec::KernelLoaderSpec(port::StringPiece kernelname)
23 : kernelname_(kernelname.ToString()) {}
24
OnDiskKernelLoaderSpec(port::StringPiece filename,port::StringPiece kernelname)25 OnDiskKernelLoaderSpec::OnDiskKernelLoaderSpec(port::StringPiece filename,
26 port::StringPiece kernelname)
27 : KernelLoaderSpec(kernelname), filename_(filename.ToString()) {}
28
CudaPtxOnDisk(port::StringPiece filename,port::StringPiece kernelname)29 CudaPtxOnDisk::CudaPtxOnDisk(port::StringPiece filename,
30 port::StringPiece kernelname)
31 : OnDiskKernelLoaderSpec(filename, kernelname) {}
32
CudaCubinOnDisk(port::StringPiece filename,port::StringPiece kernelname)33 CudaCubinOnDisk::CudaCubinOnDisk(port::StringPiece filename,
34 port::StringPiece kernelname)
35 : OnDiskKernelLoaderSpec(filename, kernelname) {}
36
CudaCubinInMemory(const char * bytes,port::StringPiece kernelname)37 CudaCubinInMemory::CudaCubinInMemory(const char *bytes,
38 port::StringPiece kernelname)
39 : KernelLoaderSpec(kernelname), bytes_(bytes) {}
40
// Strict-weak "less than" ordering for (major, minor) compute capabilities:
// majors are compared first and minors only break ties. This is exactly the
// lexicographic ordering that std::tuple's operator< provides.
bool CompareComputeCapability(const std::tuple<int, int> &lhs,
                              const std::tuple<int, int> &rhs) {
  return lhs < rhs;
}
47
48 const std::tuple<int, int> CudaPtxInMemory::kMinimumCapability{1, 0};
49
CudaPtxInMemory(port::StringPiece ptx,port::StringPiece kernel_name,bool ptx_compressed)50 CudaPtxInMemory::CudaPtxInMemory(port::StringPiece ptx,
51 port::StringPiece kernel_name,
52 bool ptx_compressed)
53 : KernelLoaderSpec(kernel_name),
54 ptx_by_compute_capability_(CompareComputeCapability) {
55 if (ptx_compressed) {
56 // Lazy decompression. Put an empty string in decompressed_ptx_ showing that
57 // the original ptx is compressed.
58 decompressed_ptx_[ptx.data()] = "";
59 }
60 ptx_by_compute_capability_[kMinimumCapability] = ptx.data();
61 }
62
CudaPtxInMemory(const std::initializer_list<CudaPtxInMemory::PtxSpec> & spec_list,port::StringPiece kernel_name,bool ptx_compressed)63 CudaPtxInMemory::CudaPtxInMemory(
64 const std::initializer_list<CudaPtxInMemory::PtxSpec> &spec_list,
65 port::StringPiece kernel_name, bool ptx_compressed)
66 : KernelLoaderSpec(kernel_name),
67 ptx_by_compute_capability_(CompareComputeCapability) {
68 for (const auto &spec : spec_list) {
69 int major, minor;
70 port::StringPiece ptx;
71 std::tie(major, minor, ptx) = spec;
72 if (ptx_compressed) {
73 // Lazy decompression. Put an empty string in decompressed_ptx_ showing
74 // that the original ptx is compressed.
75 decompressed_ptx_[ptx.data()] = "";
76 }
77 ptx_by_compute_capability_[std::tuple<int, int>{major, minor}] = ptx.data();
78 }
79 }
80
DecompressPtx(const char * ptx)81 string CudaPtxInMemory::DecompressPtx(const char *ptx) {
82 // Get the length of the PTX string from the beginning of the buffer.
83 uint64 ptx_length = *reinterpret_cast<const uint64 *>(ptx);
84 // Get the PTX string from the buffer with offset and length.
85 string compressed_ptx(ptx + sizeof(uint64),
86 ptx + sizeof(uint64) + ptx_length);
87 string decompressed_ptx;
88 // Decompress the PTX string with bzip2.
89 LOG(FATAL) << "bzip2 decompression is not supported yet.";
90 return decompressed_ptx;
91 }
92
default_text() const93 const char *CudaPtxInMemory::default_text() const {
94 if (ptx_by_compute_capability_.empty()) {
95 return nullptr;
96 }
97
98 mutex_lock lock{mu_};
99
100 auto ptx = ptx_by_compute_capability_.begin()->second;
101 // Check if there is an entry in decompressed ptx table.
102 auto decompressed_ptx_iter = decompressed_ptx_.find(ptx);
103 if (decompressed_ptx_iter != decompressed_ptx_.end()) {
104 // If the decompressed string is empty, which means the ptx hasn't been
105 // decompressed, decompress it here.
106 if (decompressed_ptx_iter->second.empty()) {
107 decompressed_ptx_iter->second = DecompressPtx(ptx);
108 }
109 return decompressed_ptx_iter->second.c_str();
110 }
111 return ptx;
112 }
113
original_default_text() const114 const char *CudaPtxInMemory::original_default_text() const {
115 if (ptx_by_compute_capability_.empty()) {
116 return nullptr;
117 }
118
119 return ptx_by_compute_capability_.begin()->second;
120 }
121
text(int compute_capability_major,int compute_capability_minor) const122 const char *CudaPtxInMemory::text(int compute_capability_major,
123 int compute_capability_minor) const {
124 std::tuple<int, int> capability{compute_capability_major,
125 compute_capability_minor};
126
127 auto ptx_iter = ptx_by_compute_capability_.find(capability);
128 if (ptx_iter == ptx_by_compute_capability_.end()) {
129 return nullptr;
130 }
131
132 mutex_lock lock{mu_};
133
134 // Check if there is an entry in decompressed ptx table.
135 auto decompressed_ptx_iter = decompressed_ptx_.find(ptx_iter->second);
136 if (decompressed_ptx_iter != decompressed_ptx_.end()) {
137 // If the decompressed string is empty, which means the ptx hasn't been
138 // decompressed, decompress it here.
139 if (decompressed_ptx_iter->second.empty()) {
140 decompressed_ptx_iter->second = DecompressPtx(ptx_iter->second);
141 }
142 return decompressed_ptx_iter->second.c_str();
143 }
144 return ptx_iter->second;
145 }
146
original_text(int compute_capability_major,int compute_capability_minor) const147 const char *CudaPtxInMemory::original_text(int compute_capability_major,
148 int compute_capability_minor) const {
149 std::tuple<int, int> capability{compute_capability_major,
150 compute_capability_minor};
151
152 auto ptx_iter = ptx_by_compute_capability_.find(capability);
153 if (ptx_iter == ptx_by_compute_capability_.end()) {
154 return nullptr;
155 }
156
157 return ptx_iter->second;
158 }
159
OpenCLTextOnDisk(port::StringPiece filename,port::StringPiece kernelname)160 OpenCLTextOnDisk::OpenCLTextOnDisk(port::StringPiece filename,
161 port::StringPiece kernelname)
162 : OnDiskKernelLoaderSpec(filename, kernelname) {}
163
OpenCLTextInMemory(port::StringPiece text,port::StringPiece kernelname)164 OpenCLTextInMemory::OpenCLTextInMemory(port::StringPiece text,
165 port::StringPiece kernelname)
166 : KernelLoaderSpec(kernelname), text_(text.ToString()) {}
167
OpenCLBinaryOnDisk(port::StringPiece filename,port::StringPiece kernelname)168 OpenCLBinaryOnDisk::OpenCLBinaryOnDisk(port::StringPiece filename,
169 port::StringPiece kernelname)
170 : OnDiskKernelLoaderSpec(filename, kernelname) {}
171
AddOpenCLTextOnDisk(port::StringPiece filename,port::StringPiece kernelname)172 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextOnDisk(
173 port::StringPiece filename, port::StringPiece kernelname) {
174 CHECK(ocl_text_on_disk_ == nullptr);
175 ocl_text_on_disk_.reset(new OpenCLTextOnDisk{filename, kernelname});
176 return this;
177 }
178
AddOpenCLBinaryOnDisk(port::StringPiece filename,port::StringPiece kernelname)179 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLBinaryOnDisk(
180 port::StringPiece filename, port::StringPiece kernelname) {
181 CHECK(ocl_binary_on_disk_ == nullptr);
182 ocl_binary_on_disk_.reset(new OpenCLBinaryOnDisk{filename, kernelname});
183 return this;
184 }
185
AddOpenCLTextInMemory(port::StringPiece filename,port::StringPiece kernelname)186 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddOpenCLTextInMemory(
187 port::StringPiece filename, port::StringPiece kernelname) {
188 CHECK(ocl_text_in_memory_ == nullptr);
189 ocl_text_in_memory_.reset(new OpenCLTextInMemory{filename, kernelname});
190 return this;
191 }
192
AddCudaPtxOnDisk(port::StringPiece filename,port::StringPiece kernelname)193 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxOnDisk(
194 port::StringPiece filename, port::StringPiece kernelname) {
195 CHECK(cuda_ptx_on_disk_ == nullptr);
196 cuda_ptx_on_disk_.reset(new CudaPtxOnDisk{filename, kernelname});
197 return this;
198 }
199
AddCudaCubinInMemory(const char * bytes,port::StringPiece kernelname)200 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinInMemory(
201 const char *bytes, port::StringPiece kernelname) {
202 CHECK(cuda_cubin_in_memory_ == nullptr);
203 cuda_cubin_in_memory_.reset(new CudaCubinInMemory{bytes, kernelname});
204 return this;
205 }
206
AddCudaCubinOnDisk(port::StringPiece filename,port::StringPiece kernelname)207 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCubinOnDisk(
208 port::StringPiece filename, port::StringPiece kernelname) {
209 CHECK(cuda_cubin_on_disk_ == nullptr);
210 cuda_cubin_on_disk_.reset(new CudaCubinOnDisk{filename, kernelname});
211 return this;
212 }
213
AddCudaPtxInMemory(port::StringPiece ptx,port::StringPiece kernelname)214 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
215 port::StringPiece ptx, port::StringPiece kernelname) {
216 CHECK(cuda_ptx_in_memory_ == nullptr);
217 cuda_ptx_in_memory_.reset(
218 new CudaPtxInMemory{ptx, kernelname, false /* ptx_compressed */});
219 return this;
220 }
221
AddCudaCompressedPtxInMemory(port::StringPiece ptx,port::StringPiece kernelname)222 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
223 port::StringPiece ptx, port::StringPiece kernelname) {
224 CHECK(cuda_ptx_in_memory_ == nullptr);
225 cuda_ptx_in_memory_.reset(
226 new CudaPtxInMemory{ptx, kernelname, true /* ptx_compressed */});
227 return this;
228 }
229
AddCudaPtxInMemory(std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,port::StringPiece kernelname)230 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaPtxInMemory(
231 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
232 port::StringPiece kernelname) {
233 CHECK(cuda_ptx_in_memory_ == nullptr);
234 cuda_ptx_in_memory_.reset(
235 new CudaPtxInMemory{spec_list, kernelname, false /* ptx_compressed */});
236 return this;
237 }
238
AddCudaCompressedPtxInMemory(std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,port::StringPiece kernelname)239 MultiKernelLoaderSpec *MultiKernelLoaderSpec::AddCudaCompressedPtxInMemory(
240 std::initializer_list<CudaPtxInMemory::PtxSpec> spec_list,
241 port::StringPiece kernelname) {
242 CHECK(cuda_ptx_in_memory_ == nullptr);
243 cuda_ptx_in_memory_.reset(
244 new CudaPtxInMemory{spec_list, kernelname, true /* ptx_compressed */});
245 return this;
246 }
247
MultiKernelLoaderSpec(size_t arity)248 MultiKernelLoaderSpec::MultiKernelLoaderSpec(size_t arity) : arity_(arity) {}
249
250 } // namespace gputools
251 } // namespace perftools
252