/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include <algorithm>
#include <cmath>
#include <tuple>
#include <unordered_map>
#include <vector>

#include "absl/strings/str_cat.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {

// Returns true if the given StreamExecutor is for a Volta or newer NVIDIA GPU.
inline bool IsVoltaOrLater(const se::StreamExecutor& stream_exec) {
  int major, minor;
  CHECK(stream_exec  // Crash OK
            .GetDeviceDescription()
            .cuda_compute_capability(&major, &minor));
  return major >= 7;
}
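
// Example use (an illustrative sketch, not code from this file; `stream` and
// `dtype` stand in for locals a conv kernel would typically have in scope):
//
//   se::StreamExecutor* stream_exec = stream->parent();
//   const bool use_tensor_ops =
//       IsVoltaOrLater(*stream_exec) && dtype == DT_HALF;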

// Gets the DNN workspace limit from the environment variable, which is in MB.
// Returns the workspace memory limit in bytes. If no value is set, returns the
// default value.
int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                           int64 default_value_in_bytes);
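
// For example, conv kernels conventionally query the limit like this (a
// sketch; the TF_CUDNN_WORKSPACE_LIMIT_IN_MB variable name and the 4GB
// default follow common TensorFlow usage):
//
//   int64 workspace_bytes = GetDnnWorkspaceLimit(
//       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32);
//
// With the variable set to N, this returns N * (1 << 20) bytes; otherwise it
// returns the 4GB default.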

// A scratch-space allocator for StreamExecutor cuDNN callbacks. TensorFlow is
// responsible for releasing the temporary buffers after the kernel finishes.
class DnnScratchAllocator : public se::ScratchAllocator {
 public:
  virtual ~DnnScratchAllocator() {}
  DnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes() override { return memory_limit_; }
  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      int64 byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return se::port::Status{se::port::error::INVALID_ARGUMENT,
                              "Requested negative byte size!"};
    }
    if (byte_size > memory_limit_) {
      return se::port::Status{se::port::error::UNAVAILABLE,
                              absl::StrCat("Requested memory size (", byte_size,
                                           ") exceeds the max memory limit (",
                                           memory_limit_, ").")};
    }
    AllocationAttributes allocation_attr;
    allocation_attr.retry_on_failure = false;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return se::port::Status{
          se::port::error::UNAVAILABLE,
          absl::StrCat("Failed to allocate the requested memory size (",
                       byte_size, ").")};
    }
    // Hold a reference to the allocated tensors until the allocator is
    // destroyed, so the scratch buffers stay alive for the whole kernel.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return se::port::StatusOr<se::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};
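
// Example use (a minimal sketch; `context` stands in for the kernel's
// OpKernelContext*, and the DNN call that consumes the allocator is elided):
//
//   DnnScratchAllocator scratch_allocator(
//       GetDnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32),
//       context);
//   // Pass &scratch_allocator to the StreamExecutor convolution call; any
//   // scratch it hands out stays alive until the allocator is destroyed.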

// Encapsulates all the shape information that is used in both forward and
// backward conv operations.
class ConvParameters {
 public:
  using SpatialArray = gtl::InlinedVector<int64, 3>;
  ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                 TensorFormat data_format, int64 out_depths,
                 const SpatialArray& filter, const SpatialArray& dilation,
                 const SpatialArray& stride, const SpatialArray& padding,
                 DataType dtype, int device_id, int group_count = 1)
      : batch_(batch),
        in_depths_(in_depths),
        out_depths_(out_depths),
        in_(CheckSpatialArraySize(in)),
        data_format_(data_format),
        filter_(CheckSpatialArraySize(filter)),
        dilation_(CheckSpatialArraySize(dilation)),
        stride_(CheckSpatialArraySize(stride)),
        padding_(CheckSpatialArraySize(padding)),
        dtype_(dtype),
        device_id_(device_id),
        group_count_(group_count) {
    hash_code_ = batch;
    hash_code_ = Hash64Combine(hash_code_, in_depths);
    for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
    hash_code_ = Hash64Combine(hash_code_, data_format);
    hash_code_ = Hash64Combine(hash_code_, out_depths);
    for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
    hash_code_ = Hash64Combine(hash_code_, dtype);
    hash_code_ = Hash64Combine(hash_code_, device_id);
    hash_code_ = Hash64Combine(hash_code_, group_count);
  }

  bool operator==(const ConvParameters& other) const {
    return this->get_data_as_tuple() == other.get_data_as_tuple();
  }

  bool operator!=(const ConvParameters& other) const {
    return !(*this == other);
  }
  uint64 hash() const { return hash_code_; }

  string ToString() const {
    // clang-format off
    return strings::StrCat(
        batch_, ", ", in_depths_, ", ",
        "(", str_util::Join(in_, ", "), "), ",
        ::tensorflow::ToString(data_format_), ", ",
        out_depths_, ", ",
        "(", str_util::Join(filter_, ", "), "), ",
        "(", str_util::Join(dilation_, ", "), "), ",
        "(", str_util::Join(stride_, ", "), "), ",
        "(", str_util::Join(padding_, ", "), "), ",
        dtype_, ", ",
        device_id_, ", ",
        group_count_);
    // clang-format on
  }

  // Returns false when the Winograd nonfused conv algorithm should be
  // excluded for these input parameters, to work around a bug in cuDNN v5
  // and v6.
  template <typename T>
  bool ShouldIncludeWinogradNonfusedAlgo(
      se::StreamExecutor* stream_exec) const {
    auto* dnn_support = stream_exec->AsDnn();
    if (!dnn_support) {
      return false;
    }
    // The bug is fixed in cuDNN 7 and newer, so skip the size check there.
    auto version = dnn_support->GetVersion();
    if (version.ok() && version.ValueOrDie().major_version() >= 7) {
      return true;
    }
    return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
  }

 protected:
  using ParameterDataType =
      std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
                 SpatialArray, SpatialArray, SpatialArray, DataType, int, int>;

  ParameterDataType get_data_as_tuple() const {
    return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
                           filter_, dilation_, stride_, padding_, dtype_,
                           device_id_, group_count_);
  }

  uint64 hash_code_;

 private:
  friend struct ConvParametersPeer;  // For testing purposes.

  static const SpatialArray& CheckSpatialArraySize(const SpatialArray& array) {
    CHECK_LE(array.size(), 3);  // Catch corruptions related to b/124313574.
    return array;
  }

  template <typename T>
  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
    // Exclude the algorithm once the estimated buffer size (with the batch
    // rounded up to a multiple of 16) reaches 2GB (1LL << 31 bytes), since
    // the cuDNN v5/v6 bug affects large inputs.
    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
                       sizeof(T);
    int64 threshold = 1LL << 31;
    return total_size < threshold;
  }

  int64 batch_;
  int64 in_depths_;
  int64 out_depths_;
  SpatialArray in_;
  TensorFormat data_format_;
  SpatialArray filter_;
  SpatialArray dilation_;
  SpatialArray stride_;
  SpatialArray padding_;
  DataType dtype_;
  int device_id_;
  int group_count_;
};
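
// Example use of ConvParameters as an autotuning cache key (a sketch; the
// hash functor and the se::dnn::AlgorithmConfig value type are illustrative
// choices, not part of this header):
//
//   struct ConvParametersHash {
//     uint64 operator()(const ConvParameters& p) const { return p.hash(); }
//   };
//   std::unordered_map<ConvParameters, se::dnn::AlgorithmConfig,
//                      ConvParametersHash>
//       autotune_map;
//
//   auto it = autotune_map.find(key);  // Uses hash() and operator==.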

typedef Eigen::GpuDevice GPUDevice;

}  // namespace tensorflow

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_