/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
17 #define TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
18
19 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
20
21 #include <unordered_map>
22
23 #include "absl/types/span.h"
24 #include "tensorflow/core/framework/tensor.h"
25 #include "tensorflow/core/lib/core/status.h"
26 #include "tensorflow/core/lib/strings/str_util.h"
27 #include "tensorflow/core/lib/strings/strcat.h"
28 #include "tensorflow/core/lib/strings/stringprintf.h"
29 #include "tensorflow/core/platform/logging.h"
30 #include "tensorflow/core/platform/stream_executor.h"
31
32 namespace stream_executor {
33 class RedzoneAllocator;
34 } // namespace stream_executor
35
36 namespace tensorflow {
37
38 class NodeDef;
39 class AutotuneResult;
40
// Returns whether the redzone check is disabled.
//
// Controlled by the TF_DISABLE_RZ_CHECK environment variable.
bool RedzoneCheckDisabled();

// Returns an allocated buffer with redzones the size of `buffer`. Does
// *not* copy the contents of the `buffer` into the newly allocated buffer:
// assumes that buffer is a pure out-parameter.
//
// Returns `buffer` if RedzoneCheckDisabled() is true.
//
// On error, returns `buffer`, and logs an error message (once).
se::DeviceMemoryBase WrapRedzoneBestEffort(se::RedzoneAllocator* rz_allocator,
                                           se::DeviceMemoryBase buffer);

// Checks the passed allocator for redzone violations.
// If violations have occurred, marks the corresponding autotune result
// as a failure.
void CheckRedzones(const se::RedzoneAllocator& rz_allocator,
                   AutotuneResult* autotune_result);
61
62 template <typename T>
AsDeviceMemory(const T * cuda_memory,uint64 size)63 inline se::DeviceMemory<T> AsDeviceMemory(const T* cuda_memory, uint64 size) {
64 se::DeviceMemoryBase wrapped(const_cast<T*>(cuda_memory), size * sizeof(T));
65 se::DeviceMemory<T> typed(wrapped);
66 return typed;
67 }
68
69 // A helper class that looks up the best autotuned config from parameters.
70 // Due to the noisy nature of autotune, especially with multiple devices, it
71 // only accepts a config if its margin exceeds a threshold.
72 // For the same shape configs, if a new best config matches the previous best,
73 // they get promoted; otherwise, the winner gets demoted. This process stops
74 // when the winner's score exceeds the threshold.
75 // In a bad case when two configs are very close to each other and flips
76 // back and forth randomly, the expected number of experiments before autotune
77 // settles is O(threshold ^ 2). So we recommend that number of warmup runs
78 // for any benchmarks.
79 template <typename Parameters, typename Config>
80 class AutotuneMap {
81 private:
82 // Retrieves the hash code of Parameters class.
83 struct Hasher {
operatorHasher84 std::size_t operator()(const Parameters& parameter) const {
85 return parameter.hash();
86 }
87 };
88
89 public:
Find(const Parameters & params,Config * config)90 bool Find(const Parameters& params, Config* config) const {
91 mutex_lock lock(mu_);
92 auto iter = params_config_map_.find(params);
93 if (iter == params_config_map_.end() ||
94 (iter->second.score < min_score_threshold_ &&
95 iter->second.count <= max_autotune_count_)) {
96 return false;
97 }
98 *config = iter->second.config;
99 return true;
100 }
Insert(const Parameters & params,const Config & config)101 void Insert(const Parameters& params, const Config& config) {
102 mutex_lock lock(mu_);
103 auto iter = params_config_map_.find(params);
104 int new_score = 0;
105 if (iter == params_config_map_.end()) {
106 // Create a new entry if params is new.
107 VLOG(1) << GetActionSummary("creates", params, config);
108 params_config_map_.insert(
109 std::make_pair(params, ValueType{config, 1, 1}));
110 new_score = 1;
111 } else if (iter->second.score < min_score_threshold_ &&
112 iter->second.count <= max_autotune_count_) {
113 DCHECK_GT(iter->second.score, 0);
114 if (iter->second.config != config) {
115 // If it is different from the current winner, demotes the winner.
116 VLOG(1) << GetActionSummary("demotes", params, config);
117 new_score = --iter->second.score;
118 ++iter->second.count;
119 if (new_score <= 0) {
120 VLOG(1) << GetActionSummary("erases", params, config);
121 params_config_map_.erase(iter);
122 }
123 } else {
124 // If it is the same as the current winner, promotes the winner.
125 VLOG(1) << GetActionSummary("promotes", params, config);
126 new_score = ++iter->second.score;
127 ++iter->second.count;
128 }
129 }
130 if (new_score >= min_score_threshold_) {
131 VLOG(1) << GetActionSummary("accepts", params, config);
132 } else if (autotune_global_count_ >= max_autotune_global_count_) {
133 // The autotuning exceeds the max iteration threshold and we accept the
134 // the winner if it exists in the map, otherwise we accept the current
135 // winner.
136 auto winner = params_config_map_.find(params);
137 if (winner == params_config_map_.end()) {
138 VLOG(1) << GetActionSummary("creates", params, config);
139 for (int i = 0; i < min_score_threshold_; ++i) {
140 VLOG(1) << GetActionSummary("promotes", params, config);
141 }
142 params_config_map_.insert(
143 std::make_pair(params, ValueType{config, min_score_threshold_, 1}));
144 } else {
145 int promotes_times = min_score_threshold_ - winner->second.score;
146 for (int i = 0; i < promotes_times; ++i) {
147 VLOG(1) << GetActionSummary("promotes", params, config);
148 }
149 winner->second.score = min_score_threshold_;
150 }
151 VLOG(1) << GetActionSummary("accepts", params, config);
152 }
153 autotune_global_count_++;
154 }
155
GetMap()156 std::unordered_map<Parameters, Config, Hasher> GetMap() const {
157 mutex_lock lock(mu_);
158 std::unordered_map<Parameters, Config, Hasher> map;
159 for (const auto& entry : params_config_map_) {
160 map.insert(std::make_pair(entry.first, entry.second.config));
161 }
162 return map;
163 }
164
165 // Only for testing
ClearMap()166 void ClearMap() {
167 mutex_lock lock(mu_);
168 params_config_map_.clear();
169 }
170
171 private:
172 // Underlying data structure of values in the map.
173 struct ValueType {
174 Config config;
175 int32 score;
176 int32 count;
177 };
AutotuneMap(const std::string & name)178 AutotuneMap(const std::string& name) : name_(name) {
179 min_score_threshold_ = 1;
180 int min_warmup_iterations = 10;
181 const char* threshold_str = getenv("TF_AUTOTUNE_THRESHOLD");
182 if (threshold_str != nullptr) {
183 VLOG(1) << "TF_AUTOTUNE_THRESHOLD = " << threshold_str;
184 strings::safe_strto32(threshold_str, &min_score_threshold_);
185 }
186 const char* min_warmup_iteration_str =
187 getenv("TF_AUTOTUNE_MIN_WARMUP_ITERATIONS");
188 if (min_warmup_iteration_str != nullptr) {
189 strings::safe_strto32(min_warmup_iteration_str, &min_warmup_iterations);
190 }
191 min_score_threshold_ = std::max(min_score_threshold_, 1);
192 max_autotune_count_ = std::max(
193 5 * min_score_threshold_ * min_score_threshold_, min_warmup_iterations);
194 max_autotune_global_count_ = 2 * max_autotune_count_;
195 autotune_global_count_ = 0;
196 }
197
198 template <class Group, class Params, class Cfg>
199 friend class AutotuneSingleton;
200
GetActionSummary(StringPiece action,const Parameters & params,const Config & config)201 std::string GetActionSummary(StringPiece action, const Parameters& params,
202 const Config& config) {
203 return strings::Printf("autotune_map %s %s: %s -> (%s)", name_.c_str(),
204 string(action).c_str(), params.ToString().c_str(),
205 config.ToString().c_str());
206 }
207
208 mutable mutex mu_;
209
210 std::unordered_map<Parameters, ValueType, Hasher> params_config_map_
211 TF_GUARDED_BY(mu_);
212 std::string name_;
213 int32 min_score_threshold_;
214 int32 max_autotune_count_;
215 int32 max_autotune_global_count_;
216 int32 autotune_global_count_;
217
218 TF_DISALLOW_COPY_AND_ASSIGN(AutotuneMap);
219 };
220
221 // A Singleton helper that manages the global autotune results by groups.
222 // The caller specified arbitrary Group type that can distinguish between
223 // different autotune results, even if their Parameters and Configs are the
224 // same.
225 template <class Group, typename Parameters, typename Config>
226 class AutotuneSingleton {
227 public:
228 typedef AutotuneMap<Parameters, Config> AutotuneType;
GetInstance()229 static AutotuneType* GetInstance() {
230 static AutotuneType* instance = new AutotuneType(Group::name());
231 return instance;
232 }
233 };
234
// Logs convolution results to customized back-storage.
void LogConvAutotuneResults(se::dnn::ConvolutionKind kind,
                            se::dnn::DataType element_type,
                            se::DeviceMemoryBase input_buffer,
                            se::DeviceMemoryBase filter_buffer,
                            se::DeviceMemoryBase output_buffer,
                            const se::dnn::BatchDescriptor& input_desc,
                            const se::dnn::FilterDescriptor& filter_desc,
                            const se::dnn::BatchDescriptor& output_desc,
                            const se::dnn::ConvolutionDescriptor& conv_desc,
                            se::StreamExecutor* stream_exec,
                            absl::Span<const AutotuneResult> results);

// Logs fused convolution results to customized back-storage.
void LogFusedConvForwardAutotuneResults(
    se::dnn::DataType element_type, se::DeviceMemoryBase input_buffer,
    se::DeviceMemoryBase filter_buffer, se::DeviceMemoryBase output_buffer,
    se::DeviceMemoryBase bias_buffer, se::DeviceMemoryBase side_input_buffer,
    const se::dnn::BatchDescriptor& input_desc,
    const se::dnn::FilterDescriptor& filter_desc,
    const se::dnn::BatchDescriptor& output_desc,
    const se::dnn::ConvolutionDescriptor& conv_desc, double conv_scale,
    double side_value_scale, se::dnn::ActivationMode activation_mode,
    se::StreamExecutor* stream_exec, absl::Span<const AutotuneResult> results);

// Returns the best algorithms for the config: one is the fastest, the other
// is the fastest with 0 scratch space. Unsuccessful autotuning results are
// allowed and ignored. The "plans" can be null when Cudnn frontend APIs are
// not used.
Status BestCudnnConvAlgorithm(
    absl::Span<const AutotuneResult> results,
    std::vector<std::unique_ptr<se::dnn::ConvolveExecutionPlan>>* plans,
    se::dnn::AlgorithmConfig* algo);
268
269 } // namespace tensorflow
270
271 #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
272
273 #endif // TENSORFLOW_CORE_KERNELS_GPU_UTILS_H_
274