/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA

#include <tuple>
#include <unordered_map>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"

namespace tensorflow {

// Gets the cuDNN workspace limit from the environment variable, whose value is
// given in MB. Returns the workspace memory limit in bytes; if the variable is
// not set, returns the default value.
int64 GetDnnWorkspaceLimit(const string& envvar_in_mb,
                           int64 default_value_in_bytes);

// A class that provides a scratch-space allocator for Stream-Executor cuDNN
// callbacks. TensorFlow is responsible for releasing the temporary buffers
// after the kernel finishes.
class DnnScratchAllocator : public se::ScratchAllocator {
 public:
  virtual ~DnnScratchAllocator() {}
  DnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes(se::Stream* stream) override {
    return memory_limit_;
  }
  se::port::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
      se::Stream* stream, int64 byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return se::port::Status{se::port::error::INVALID_ARGUMENT,
                              "Requested negative byte size!"};
    }
    if (byte_size > memory_limit_) {
      return se::port::StatusOr<se::DeviceMemory<uint8>>();
    }
    AllocationAttributes allocation_attr;
    allocation_attr.no_retry_on_failure = true;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return se::port::StatusOr<se::DeviceMemory<uint8>>();
    }
    // Hold a reference to the allocated tensors until the allocator is
    // destroyed.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return se::port::StatusOr<se::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};
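// Illustrative sketch (not part of the original header): a convolution kernel
// typically sizes this allocator with GetDnnWorkspaceLimit() and hands it to a
// Stream-Executor cuDNN call; the default value below is only an example.
//
//   int64 workspace_limit = GetDnnWorkspaceLimit(
//       "TF_CUDNN_WORKSPACE_LIMIT_IN_MB", 1LL << 32 /* example default */);
//   DnnScratchAllocator scratch_allocator(workspace_limit, context);
//   // scratch_allocator is then passed to a call such as
//   // Stream::ThenConvolveWithAlgorithm(), which draws its cuDNN scratch
//   // space from context->allocate_temp() via AllocateBytes() above.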
// Encapsulates all the shape information that is used in both forward and
// backward conv operations.
class ConvParameters {
 public:
  using SpatialArray = gtl::InlinedVector<int64, 3>;
  ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                 TensorFormat data_format, int64 out_depths,
                 const SpatialArray& filter, const SpatialArray& dilation,
                 const SpatialArray& stride, const SpatialArray& padding,
                 DataType dtype, int device_id)
      : batch_(batch),
        in_depths_(in_depths),
        out_depths_(out_depths),
        in_(CheckSpatialArraySize(in)),
        data_format_(data_format),
        filter_(CheckSpatialArraySize(filter)),
        dilation_(CheckSpatialArraySize(dilation)),
        stride_(CheckSpatialArraySize(stride)),
        padding_(CheckSpatialArraySize(padding)),
        dtype_(dtype),
        device_id_(device_id) {
    hash_code_ = batch;
    hash_code_ = Hash64Combine(hash_code_, in_depths);
    for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
    hash_code_ = Hash64Combine(hash_code_, data_format);
    hash_code_ = Hash64Combine(hash_code_, out_depths);
    for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
    hash_code_ = Hash64Combine(hash_code_, dtype);
    hash_code_ = Hash64Combine(hash_code_, device_id);
  }

  bool operator==(const ConvParameters& other) const {
    return this->get_data_as_tuple() == other.get_data_as_tuple();
  }

  bool operator!=(const ConvParameters& other) const {
    return !(*this == other);
  }

  uint64 hash() const { return hash_code_; }

  string ToString() const {
    // clang-format off
    return strings::StrCat(
        batch_, ", ", in_depths_, ", ",
        "(", str_util::Join(in_, ", "), "), ",
        ::tensorflow::ToString(data_format_), ", ",
        out_depths_, ", ",
        "(", str_util::Join(filter_, ", "), "), ",
        "(", str_util::Join(dilation_, ", "), "), ",
        "(", str_util::Join(stride_, ", "), "), ",
        "(", str_util::Join(padding_, ", "), "), ",
        dtype_, ", ",
        device_id_);
    // clang-format on
  }

  // Disables the Winograd nonfused conv algorithm for certain input
  // parameters, to avoid a bug in cuDNN v5 and cuDNN v6.
  template <typename T>
  bool ShouldIncludeWinogradNonfusedAlgo(
      se::StreamExecutor* stream_exec) const {
    auto* dnn_support = stream_exec->AsDnn();
    if (!dnn_support) {
      return false;
    }
    // Skip this check for cuDNN 7 and newer.
    auto version = dnn_support->GetVersion();
    if (version.ok() && version.ValueOrDie().major_version() >= 7) {
      return true;
    }
    return ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
  }

 protected:
  using ParameterDataType =
      std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
                 SpatialArray, SpatialArray, SpatialArray, DataType, int>;

  ParameterDataType get_data_as_tuple() const {
    return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
                           filter_, dilation_, stride_, padding_, dtype_,
                           device_id_);
  }

  uint64 hash_code_;

 private:
  friend struct ConvParametersPeer;  // For testing purposes.

  static const SpatialArray& CheckSpatialArraySize(const SpatialArray& array) {
    CHECK_LE(array.size(), 3);  // Catch corruptions related to b/124313574.
    return array;
  }

  template <typename T>
  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() const {
    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
                       sizeof(T);
    int64 threshold = 1LL << 31;
    if (total_size >= threshold) {
      return false;
    } else {
      return true;
    }
  }

  int64 batch_;
  int64 in_depths_;
  int64 out_depths_;
  SpatialArray in_;
  TensorFormat data_format_;
  SpatialArray filter_;
  SpatialArray dilation_;
  SpatialArray stride_;
  SpatialArray padding_;
  DataType dtype_;
  int device_id_;
};
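// Illustrative sketch (not part of the original header): ConvParameters is
// designed to act as a cache key, combining hash() with operator==. A minimal
// hand-rolled cache, using a hypothetical ConvParametersHash helper, could
// look like:
//
//   struct ConvParametersHash {
//     std::size_t operator()(const ConvParameters& p) const { return p.hash(); }
//   };
//   std::unordered_map<ConvParameters, se::dnn::AlgorithmConfig,
//                      ConvParametersHash> autotune_cache;
//
// In TensorFlow itself, cuDNN autotuning results are cached through the
// autotune helpers declared in gpu_utils.h rather than a raw map like this.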
typedef Eigen::GpuDevice GPUDevice;

}  // namespace tensorflow

#endif  // GOOGLE_CUDA

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_