/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// The ROCM-specific DNN library support, implementing the general DnnSupport
// interface.

#ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
#define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_

#include "tensorflow/stream_executor/dnn.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/plugin_registry.h"
#include "tensorflow/stream_executor/temporary_device_memory.h"

29 namespace stream_executor {
30 namespace gpu {
31 
32 class GpuExecutor;
33 class MIOpenRnnDescriptor;
34 class MIOpenRnnSequenceTensorDescriptor;
35 class MIOpenRnnStateTensorDescriptor;
36 // Opaque and unique identifier for the MIOpen plugin.
37 extern const PluginId kMIOpenPlugin;
38 
39 // miopen-library based DNN support. For details on overridden interface
40 // functions, see dnn.h.
41 class MIOpenSupport : public dnn::DnnSupport {
42  public:
43   explicit MIOpenSupport(GpuExecutor* parent);
44 
45   port::Status Init() override;
46   port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override;
47 
48   port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor(
49       int num_layers, int hidden_size, int input_size, int batch_size,
50       dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode,
51       dnn::RnnMode rnn_mode, dnn::DataType data_type,
52       const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
53       ScratchAllocator* state_allocator) override;
54 
55   port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
56   createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
57                                     int data_size,
58                                     dnn::DataType data_type) override;
59 
60   port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
61   createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
62                                  dnn::DataType data_type) override;
63 
64   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
65                     const dnn::RnnSequenceTensorDescriptor& input_desc,
66                     const DeviceMemory<Eigen::half>& input_data,
67                     const dnn::RnnStateTensorDescriptor& input_h_desc,
68                     const DeviceMemory<Eigen::half>& input_h_data,
69                     const dnn::RnnStateTensorDescriptor& input_c_desc,
70                     const DeviceMemory<Eigen::half>& input_c_data,
71                     const DeviceMemory<Eigen::half>& params,
72                     const dnn::RnnSequenceTensorDescriptor& output_desc,
73                     DeviceMemory<Eigen::half>* output_data,
74                     const dnn::RnnStateTensorDescriptor& output_h_desc,
75                     DeviceMemory<Eigen::half>* output_h_data,
76                     const dnn::RnnStateTensorDescriptor& output_c_desc,
77                     DeviceMemory<Eigen::half>* output_c_data, bool is_training,
78                     ScratchAllocator* reserve_space_allocator,
79                     ScratchAllocator* workspace_allocator,
80                     dnn::ProfileResult* output_profile_result) override;
81 
82   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
83                     const dnn::RnnSequenceTensorDescriptor& input_desc,
84                     const DeviceMemory<float>& input_data,
85                     const dnn::RnnStateTensorDescriptor& input_h_desc,
86                     const DeviceMemory<float>& input_h_data,
87                     const dnn::RnnStateTensorDescriptor& input_c_desc,
88                     const DeviceMemory<float>& input_c_data,
89                     const DeviceMemory<float>& params,
90                     const dnn::RnnSequenceTensorDescriptor& output_desc,
91                     DeviceMemory<float>* output_data,
92                     const dnn::RnnStateTensorDescriptor& output_h_desc,
93                     DeviceMemory<float>* output_h_data,
94                     const dnn::RnnStateTensorDescriptor& output_c_desc,
95                     DeviceMemory<float>* output_c_data, bool is_training,
96                     ScratchAllocator* reserve_space_allocator,
97                     ScratchAllocator* workspace_allocator,
98                     dnn::ProfileResult* output_profile_result) override;
99 
100   bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
101                     const dnn::RnnSequenceTensorDescriptor& input_desc,
102                     const DeviceMemory<double>& input_data,
103                     const dnn::RnnStateTensorDescriptor& input_h_desc,
104                     const DeviceMemory<double>& input_h_data,
105                     const dnn::RnnStateTensorDescriptor& input_c_desc,
106                     const DeviceMemory<double>& input_c_data,
107                     const DeviceMemory<double>& params,
108                     const dnn::RnnSequenceTensorDescriptor& output_desc,
109                     DeviceMemory<double>* output_data,
110                     const dnn::RnnStateTensorDescriptor& output_h_desc,
111                     DeviceMemory<double>* output_h_data,
112                     const dnn::RnnStateTensorDescriptor& output_c_desc,
113                     DeviceMemory<double>* output_c_data, bool is_training,
114                     ScratchAllocator* reserve_space_allocator,
115                     ScratchAllocator* workspace_allocator,
116                     dnn::ProfileResult* output_profile_result) override;
117 
118   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
119                      const dnn::RnnSequenceTensorDescriptor& input_desc,
120                      const DeviceMemory<Eigen::half>& input_data,
121                      const dnn::RnnStateTensorDescriptor& input_h_desc,
122                      const DeviceMemory<Eigen::half>& input_h_data,
123                      const dnn::RnnStateTensorDescriptor& input_c_desc,
124                      const DeviceMemory<Eigen::half>& input_c_data,
125                      const DeviceMemory<Eigen::half>& params,
126                      const dnn::RnnSequenceTensorDescriptor& output_desc,
127                      const DeviceMemory<Eigen::half>& output_data,
128                      const dnn::RnnStateTensorDescriptor& output_h_desc,
129                      const DeviceMemory<Eigen::half>& output_h_data,
130                      const dnn::RnnStateTensorDescriptor& output_c_desc,
131                      const DeviceMemory<Eigen::half>& output_c_data,
132                      const DeviceMemory<Eigen::half>& output_backprop_data,
133                      const DeviceMemory<Eigen::half>& output_h_backprop_data,
134                      const DeviceMemory<Eigen::half>& output_c_backprop_data,
135                      DeviceMemory<Eigen::half>* input_backprop_data,
136                      DeviceMemory<Eigen::half>* input_h_backprop_data,
137                      DeviceMemory<Eigen::half>* input_c_backprop_data,
138                      DeviceMemory<Eigen::half>* params_backprop_data,
139                      DeviceMemory<uint8>* reserve_space_data,
140                      ScratchAllocator* workspace_allocator,
141                      dnn::ProfileResult* output_profile_result) override;
142 
143   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
144                      const dnn::RnnSequenceTensorDescriptor& input_desc,
145                      const DeviceMemory<float>& input_data,
146                      const dnn::RnnStateTensorDescriptor& input_h_desc,
147                      const DeviceMemory<float>& input_h_data,
148                      const dnn::RnnStateTensorDescriptor& input_c_desc,
149                      const DeviceMemory<float>& input_c_data,
150                      const DeviceMemory<float>& params,
151                      const dnn::RnnSequenceTensorDescriptor& output_desc,
152                      const DeviceMemory<float>& output_data,
153                      const dnn::RnnStateTensorDescriptor& output_h_desc,
154                      const DeviceMemory<float>& output_h_data,
155                      const dnn::RnnStateTensorDescriptor& output_c_desc,
156                      const DeviceMemory<float>& output_c_data,
157                      const DeviceMemory<float>& output_backprop_data,
158                      const DeviceMemory<float>& output_h_backprop_data,
159                      const DeviceMemory<float>& output_c_backprop_data,
160                      DeviceMemory<float>* input_backprop_data,
161                      DeviceMemory<float>* input_h_backprop_data,
162                      DeviceMemory<float>* input_c_backprop_data,
163                      DeviceMemory<float>* params_backprop_data,
164                      DeviceMemory<uint8>* reserve_space_data,
165                      ScratchAllocator* workspace_allocator,
166                      dnn::ProfileResult* output_profile_result) override;
167 
168   bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
169                      const dnn::RnnSequenceTensorDescriptor& input_desc,
170                      const DeviceMemory<double>& input_data,
171                      const dnn::RnnStateTensorDescriptor& input_h_desc,
172                      const DeviceMemory<double>& input_h_data,
173                      const dnn::RnnStateTensorDescriptor& input_c_desc,
174                      const DeviceMemory<double>& input_c_data,
175                      const DeviceMemory<double>& params,
176                      const dnn::RnnSequenceTensorDescriptor& output_desc,
177                      const DeviceMemory<double>& output_data,
178                      const dnn::RnnStateTensorDescriptor& output_h_desc,
179                      const DeviceMemory<double>& output_h_data,
180                      const dnn::RnnStateTensorDescriptor& output_c_desc,
181                      const DeviceMemory<double>& output_c_data,
182                      const DeviceMemory<double>& output_backprop_data,
183                      const DeviceMemory<double>& output_h_backprop_data,
184                      const DeviceMemory<double>& output_c_backprop_data,
185                      DeviceMemory<double>* input_backprop_data,
186                      DeviceMemory<double>* input_h_backprop_data,
187                      DeviceMemory<double>* input_c_backprop_data,
188                      DeviceMemory<double>* params_backprop_data,
189                      DeviceMemory<uint8>* reserve_space_data,
190                      ScratchAllocator* workspace_allocator,
191                      dnn::ProfileResult* output_profile_result) override;
192 
193   bool GetConvolveAlgorithms(
194       bool with_winograd_nonfused, int cc_major, int cc_minor,
195       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
196 
197   bool GetRnnAlgorithms(
198       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
199 
200   bool GetConvolveBackwardDataAlgorithms(
201       bool with_winograd_nonfused, int cc_major, int cc_minor,
202       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
203 
204   bool GetConvolveBackwardFilterAlgorithms(
205       bool with_winograd_nonfused, int cc_major, int cc_minor,
206       std::vector<dnn::AlgorithmDesc>* out_algorithms) override;
207 
208   bool DoBatchNormalizationForward(
209       Stream* stream, const DeviceMemory<float>& x,
210       const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
211       const DeviceMemory<float>& estimated_mean,
212       const DeviceMemory<float>& estimated_variance,
213       const dnn::BatchDescriptor& x_desc,
214       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
215       DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
216       DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
217       DeviceMemory<float>* saved_inv_var, bool is_training,
218       std::function<const DeviceMemory<float>&()> var_to_inv_var,
219       std::function<void()> inv_var_to_var) override;
220 
221   bool DoBatchNormalizationForward(
222       Stream* stream, const DeviceMemory<Eigen::half>& x,
223       const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
224       const DeviceMemory<float>& estimated_mean,
225       const DeviceMemory<float>& estimated_variance,
226       const dnn::BatchDescriptor& x_desc,
227       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
228       DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
229       DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean,
230       DeviceMemory<float>* saved_inv_var, bool is_training,
231       std::function<const DeviceMemory<float>&()> var_to_inv_var,
232       std::function<void()> inv_var_to_var) override;
233 
234   bool DoBatchNormalizationBackward(
235       Stream* stream, const DeviceMemory<float>& y_backprop,
236       const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
237       const DeviceMemory<float>& mean, const DeviceMemory<float>& variance,
238       const dnn::BatchDescriptor& x_desc,
239       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
240       DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
241       DeviceMemory<float>* offset_backprop) override;
242 
243   bool DoBatchNormalizationBackward(
244       Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
245       const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
246       const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
247       const dnn::BatchDescriptor& x_desc,
248       const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
249       DeviceMemory<Eigen::half>* x_backprop,
250       DeviceMemory<float>* scale_backprop,
251       DeviceMemory<float>* offset_backprop) override;
252 
253   port::Status DoConvolve(
254       dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
255       const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
256       const dnn::FilterDescriptor& filter_descriptor,
257       DeviceMemoryBase filter_data,
258       const dnn::BatchDescriptor& output_descriptor,
259       DeviceMemoryBase output_data,
260       const dnn::ConvolutionDescriptor& convolution_descriptor,
261       dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
262       dnn::ProfileResult* output_profile_result) override;
263 
264   bool DoFusedConvolve(
265       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
266       const DeviceMemory<double>& conv_input_data, double conv_input_scale,
267       const dnn::FilterDescriptor& filter_descriptor,
268       const DeviceMemory<double>& filter_data,
269       const dnn::ConvolutionDescriptor& convolution_descriptor,
270       const DeviceMemory<double>& side_input_data, double side_input_scale,
271       const dnn::BatchDescriptor& bias_descriptor,
272       const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
273       const dnn::BatchDescriptor& output_descriptor,
274       DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
275       const dnn::AlgorithmConfig& algorithm_config,
276       dnn::ProfileResult* output_profile_result) override;
277 
278   bool DoFusedConvolve(
279       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
280       const DeviceMemory<float>& conv_input_data, float conv_input_scale,
281       const dnn::FilterDescriptor& filter_descriptor,
282       const DeviceMemory<float>& filter_data,
283       const dnn::ConvolutionDescriptor& convolution_descriptor,
284       const DeviceMemory<float>& side_input_data, float side_input_scale,
285       const dnn::BatchDescriptor& bias_descriptor,
286       const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
287       const dnn::BatchDescriptor& output_descriptor,
288       DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
289       const dnn::AlgorithmConfig& algorithm_config,
290       dnn::ProfileResult* output_profile_result) override;
291 
292   bool DoFusedConvolve(Stream* stream,
293                        const dnn::BatchDescriptor& conv_input_descriptor,
294                        const DeviceMemory<Eigen::half>& conv_input_data,
295                        float conv_input_scale,
296                        const dnn::FilterDescriptor& filter_descriptor,
297                        const DeviceMemory<Eigen::half>& filter_data,
298                        const dnn::ConvolutionDescriptor& convolution_descriptor,
299                        const DeviceMemory<Eigen::half>& side_input_data,
300                        float side_input_scale,
301                        const dnn::BatchDescriptor& bias_descriptor,
302                        const DeviceMemory<Eigen::half>& biases,
303                        dnn::ActivationMode activation_mode,
304                        const dnn::BatchDescriptor& output_descriptor,
305                        DeviceMemory<Eigen::half>* output_data,
306                        ScratchAllocator* scratch_allocator,
307                        const dnn::AlgorithmConfig& algorithm_config,
308                        dnn::ProfileResult* output_profile_result) override;
309 
310   bool DoFusedConvolve(
311       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
312       const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
313       const dnn::FilterDescriptor& filter_descriptor,
314       const DeviceMemory<int8>& filter_data,
315       const dnn::ConvolutionDescriptor& convolution_descriptor,
316       const DeviceMemory<int8>& side_input_data, float side_input_scale,
317       const dnn::BatchDescriptor& bias_descriptor,
318       const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
319       const dnn::BatchDescriptor& output_descriptor,
320       DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
321       const dnn::AlgorithmConfig& algorithm_config,
322       dnn::ProfileResult* output_profile_result) override;
323 
DoConvolveQuantized(Stream * stream,const dnn::BatchDescriptor & input_descriptor,const DeviceMemory<float> & input_data,const dnn::FilterDescriptor & filter_descriptor,const DeviceMemory<int8> & filter_coefficients,const DeviceMemory<float> & coefficient_scales,const dnn::ConvolutionDescriptor & convolution_descriptor,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data)324   bool DoConvolveQuantized(
325       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
326       const DeviceMemory<float>& input_data,
327       const dnn::FilterDescriptor& filter_descriptor,
328       const DeviceMemory<int8>& filter_coefficients,
329       const DeviceMemory<float>& coefficient_scales,
330       const dnn::ConvolutionDescriptor& convolution_descriptor,
331       const dnn::BatchDescriptor& output_descriptor,
332       DeviceMemory<float>* output_data) override {
333     LOG(ERROR) << "DoConvolveQuantized not supported by MIOpen";
334     return false;
335   }
336 
DoConvolveQuantized(Stream * stream,const dnn::BatchDescriptor & input_descriptor,const DeviceMemory<float> & input_data,const dnn::FilterDescriptor & filter_descriptor,const DeviceMemory<int16> & filter_coefficients,const DeviceMemory<float> & coefficient_scales,const dnn::ConvolutionDescriptor & convolution_descriptor,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data)337   bool DoConvolveQuantized(
338       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
339       const DeviceMemory<float>& input_data,
340       const dnn::FilterDescriptor& filter_descriptor,
341       const DeviceMemory<int16>& filter_coefficients,
342       const DeviceMemory<float>& coefficient_scales,
343       const dnn::ConvolutionDescriptor& convolution_descriptor,
344       const dnn::BatchDescriptor& output_descriptor,
345       DeviceMemory<float>* output_data) override {
346     LOG(ERROR) << "DoConvolveQuantized not supported by MIOpen";
347     return false;
348   }
349 
DoSeparableConvolve(Stream * stream,const dnn::BatchDescriptor & batch_descriptor,const DeviceMemory<float> & input_data,const dnn::FilterDescriptor & filter_descriptor,int depth_multiplier,const DeviceMemory<float> & first_weights,const DeviceMemory<float> & second_weights,const dnn::ConvolutionDescriptor & convolution_descriptor,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data)350   bool DoSeparableConvolve(
351       Stream* stream, const dnn::BatchDescriptor& batch_descriptor,
352       const DeviceMemory<float>& input_data,
353       const dnn::FilterDescriptor& filter_descriptor, int depth_multiplier,
354       const DeviceMemory<float>& first_weights,
355       const DeviceMemory<float>& second_weights,
356       const dnn::ConvolutionDescriptor& convolution_descriptor,
357       const dnn::BatchDescriptor& output_descriptor,
358       DeviceMemory<float>* output_data) override {
359     LOG(ERROR) << "separable convolution not supported by MIOpen";
360     return false;
361   }
362 
363   bool DoConvolveBackwardBias(
364       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
365       const DeviceMemory<double>& input_data,
366       const dnn::BatchDescriptor& bias_descriptor,
367       DeviceMemory<double>* backward_bias_data) override;
368 
369   bool DoConvolveBackwardBias(Stream* stream,
370                               const dnn::BatchDescriptor& input_descriptor,
371                               const DeviceMemory<float>& input_data,
372                               const dnn::BatchDescriptor& bias_descriptor,
373                               DeviceMemory<float>* backward_bias_data) override;
374 
375   bool DoConvolveBackwardBias(
376       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
377       const DeviceMemory<Eigen::half>& input_data,
378       const dnn::BatchDescriptor& bias_descriptor,
379       DeviceMemory<Eigen::half>* backward_bias_data) override;
380 
381   bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
382                 const DeviceMemory<float>& weights,
383                 const dnn::BatchDescriptor& input_dimensions,
384                 const dnn::BatchDescriptor& output_dimensions,
385                 DeviceMemory<float>* output_data) override;
386 
DoMatMulQuantized(Stream * stream,const DeviceMemory<float> & input_data,const DeviceMemory<int8> & quantized_weights,const DeviceMemory<float> & weight_scales,const dnn::BatchDescriptor & input_dimensions,const dnn::BatchDescriptor & output_dimensions,DeviceMemory<float> * output_data)387   bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data,
388                          const DeviceMemory<int8>& quantized_weights,
389                          const DeviceMemory<float>& weight_scales,
390                          const dnn::BatchDescriptor& input_dimensions,
391                          const dnn::BatchDescriptor& output_dimensions,
392                          DeviceMemory<float>* output_data) override {
393     LOG(ERROR) << "DNN MatMulQuantized not supported by MIOpen";
394     return false;
395   }
396 
DoMatMulQuantized(Stream * stream,const DeviceMemory<float> & input_data,const DeviceMemory<int16> & quantized_weights,const DeviceMemory<float> & weight_scales,const dnn::BatchDescriptor & input_dimensions,const dnn::BatchDescriptor & output_dimensions,DeviceMemory<float> * output_data)397   bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data,
398                          const DeviceMemory<int16>& quantized_weights,
399                          const DeviceMemory<float>& weight_scales,
400                          const dnn::BatchDescriptor& input_dimensions,
401                          const dnn::BatchDescriptor& output_dimensions,
402                          DeviceMemory<float>* output_data) override {
403     LOG(ERROR) << "DNN MatMulQuantized not supported by MIOpen";
404     return false;
405   }
406 
407   bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
408                  const DeviceMemory<float>& biases,
409                  const dnn::BatchDescriptor& dimensions,
410                  DeviceMemory<float>* output_data) override;
411 
412   bool DoActivate(Stream* stream, dnn::ActivationMode activation_mode,
413                   const dnn::BatchDescriptor& dimensions,
414                   const DeviceMemory<float>& input_data,
415                   DeviceMemory<float>* output_data, uint64 options) override;
416 
417   bool DoPoolForward(Stream* stream,
418                      const dnn::PoolingDescriptor& pooling_dimensions,
419                      const dnn::BatchDescriptor& input_dimensions,
420                      const DeviceMemory<double>& input_data,
421                      const dnn::BatchDescriptor& output_dimensions,
422                      DeviceMemory<double>* output_data,
423                      ScratchAllocator* workspace_allocator = nullptr) override;
424 
425   bool DoPoolForward(Stream* stream,
426                      const dnn::PoolingDescriptor& pooling_dimensions,
427                      const dnn::BatchDescriptor& input_dimensions,
428                      const DeviceMemory<float>& input_data,
429                      const dnn::BatchDescriptor& output_dimensions,
430                      DeviceMemory<float>* output_data,
431                      ScratchAllocator* workspace_allocator = nullptr) override;
432 
433   bool DoPoolForward(Stream* stream,
434                      const dnn::PoolingDescriptor& pooling_dimensions,
435                      const dnn::BatchDescriptor& input_dimensions,
436                      const DeviceMemory<Eigen::half>& input_data,
437                      const dnn::BatchDescriptor& output_dimensions,
438                      DeviceMemory<Eigen::half>* output_data,
439                      ScratchAllocator* workspace_allocator = nullptr) override;
440 
441   bool DoPoolBackward(Stream* stream,
442                       const dnn::PoolingDescriptor& pooling_dimensions,
443                       const dnn::BatchDescriptor& input_dimensions,
444                       const DeviceMemory<double>& input_data,
445                       const dnn::BatchDescriptor& output_dimensions,
446                       const DeviceMemory<double>& output_data,
447                       const DeviceMemory<double>& input_diff_data,
448                       DeviceMemory<double>* output_diff_data,
449                       ScratchAllocator* workspace_allocator = nullptr) override;
450 
451   bool DoPoolBackward(Stream* stream,
452                       const dnn::PoolingDescriptor& pooling_dimensions,
453                       const dnn::BatchDescriptor& input_dimensions,
454                       const DeviceMemory<float>& input_data,
455                       const dnn::BatchDescriptor& output_dimensions,
456                       const DeviceMemory<float>& output_data,
457                       const DeviceMemory<float>& input_diff_data,
458                       DeviceMemory<float>* output_diff_data,
459                       ScratchAllocator* workspace_allocator = nullptr) override;
460 
461   bool DoPoolBackward(Stream* stream,
462                       const dnn::PoolingDescriptor& pooling_dimensions,
463                       const dnn::BatchDescriptor& input_dimensions,
464                       const DeviceMemory<Eigen::half>& input_data,
465                       const dnn::BatchDescriptor& output_dimensions,
466                       const DeviceMemory<Eigen::half>& output_data,
467                       const DeviceMemory<Eigen::half>& input_diff_data,
468                       DeviceMemory<Eigen::half>* output_diff_data,
469                       ScratchAllocator* workspace_allocator = nullptr) override;
470 
471   bool DoNormalizeWithDimensions(
472       Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
473       const dnn::BatchDescriptor& dimensions,
474       const DeviceMemory<float>& input_data,
475       DeviceMemory<float>* output_data) override;
476 
477   bool DoNormalizeBackwardWithDimensions(
478       Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
479       const dnn::BatchDescriptor& dimensions,
480       const DeviceMemory<float>& raw_data,
481       const DeviceMemory<float>& normalized_data,
482       const DeviceMemory<float>& normalized_variable_gradient,
483       DeviceMemory<float>* raw_variable_gradient,
484       ScratchAllocator* workspace_allocator = nullptr) override;
485 
486   bool DoDepthConcatenate(
487       Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
488       port::ArraySlice<const DeviceMemory<float>*> input_data,
489       DeviceMemory<float>* output_data) override;
490 
491   bool DoElementwiseOperate(
492       Stream* stream, dnn::ElementwiseOperation operation,
493       port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
494       port::ArraySlice<const DeviceMemory<float>*> input_data,
495       const dnn::BatchDescriptor& output_dimensions,
496       DeviceMemory<float>* output_data) override;
497 
498   bool DoXYPad(Stream* stream, const dnn::BatchDescriptor& dimensions,
499                const DeviceMemory<float>& input_data, int64 left_pad,
500                int64 right_pad, int64 top_pad, int64 bottom_pad,
501                DeviceMemory<float>* output_data) override;
502 
503   bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor& dimensions,
504                  const DeviceMemory<float>& input_data, int64 left_trim,
505                  int64 right_trim, int64 top_trim, int64 bottom_trim,
506                  DeviceMemory<float>* output_data) override;
507 
508   bool DoMemcpyD2HQuantized(Stream* stream,
509                             const DeviceMemory<float>& device_unquantized_src,
510                             dnn::QuantizedActivationMode mode, void* host_dst,
511                             int64 size) override;
512 
513   bool DoMemcpyH2DQuantized(
514       Stream* stream, const void* host_src, int64 size,
515       dnn::QuantizedActivationMode mode,
516       DeviceMemory<float>* device_unquantized_dst) override;
517 
518   // Derives an output batch descriptor from an input batch and convolution
519   // descriptors.
520   bool DeriveOutputBatchDescriptor(
521       const dnn::BatchDescriptor& batch_descriptor,
522       const dnn::FilterDescriptor& filter_descriptor,
523       const dnn::ConvolutionDescriptor& convolution_descriptor,
524       dnn::BatchDescriptor* output_batch_descriptor);
525 
  // Transforms a tensor between the given layouts/element types, applying
  // `scale` to the values. See dnn.h. Overrides dnn::DnnSupport.
  bool DoTransformTensor(Stream* stream, const dnn::BatchDescriptor& input_desc,
                         dnn::DataType input_type,
                         const DeviceMemoryBase& input_data,
                         const dnn::BatchDescriptor& output_desc,
                         dnn::DataType output_type, float scale,
                         DeviceMemoryBase* output_data) override;
532 
  // Fused convolution + bias-add + activation for float data. If
  // output_profile_result is non-null, timing/algorithm info is recorded
  // there. See dnn.h. Overrides dnn::DnnSupport.
  bool DoFusedConvolutionBiasActivation(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<float>& conv_input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<float>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data,
      dnn::ProfileResult* output_profile_result) override;
544 
  // Fused batch-norm (inference mode, using the supplied running
  // mean/variance) + activation, float input/output. See dnn.h.
  // Overrides dnn::DnnSupport.
  bool DoFusedBatchNormActivationInference(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<float>& x_data,
      const dnn::BatchDescriptor& scale_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& mean_data,
      const DeviceMemory<float>& variance_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
      dnn::ProfileResult* output_profile_result) override;

  // Half-precision (Eigen::half) overload of the above; scale/offset and
  // statistics remain float.
  bool DoFusedBatchNormActivationInference(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<Eigen::half>& x_data,
      const dnn::BatchDescriptor& scale_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& mean_data,
      const DeviceMemory<float>& variance_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
      dnn::ProfileResult* output_profile_result) override;
566 
  // Fused batch-norm (training mode) + activation forward pass, float
  // input/output. Emits batch and saved mean/variance for the backward pass.
  // See dnn.h. Overrides dnn::DnnSupport.
  bool DoFusedBatchNormActivationForward(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<float>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
      dnn::ProfileResult* output_profile_result) override;

  // Half-precision (Eigen::half) overload of the above; statistics stay
  // float.
  bool DoFusedBatchNormActivationForward(
      Stream* stream, const dnn::BatchDescriptor& x_descriptor,
      const DeviceMemory<Eigen::half>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data, double epsilon,
      dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
      DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
      DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
      dnn::ProfileResult* output_profile_result) override;
588 
  // Backward pass for the fused batch-norm + activation, float data.
  // Consumes the saved mean/variance produced by the forward pass and
  // produces x/scale/offset backprops. See dnn.h. Overrides dnn::DnnSupport.
  bool DoFusedBatchNormActivationBackward(
      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
      const DeviceMemory<float>& y_act_backprop_data,
      const DeviceMemory<float>& y_act_data,
      dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& saved_mean_data,
      const DeviceMemory<float>& saved_var_data,
      DeviceMemory<float>* x_bn_backprop_data,
      DeviceMemory<float>* scale_backprop_data,
      DeviceMemory<float>* offset_backprop_data,
      dnn::ProfileResult* output_profile_result) override;

  // Half-precision (Eigen::half) overload of the above; scale/offset
  // backprops stay float.
  bool DoFusedBatchNormActivationBackward(
      Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
      const DeviceMemory<Eigen::half>& y_act_backprop_data,
      const DeviceMemory<Eigen::half>& y_act_data,
      dnn::ActivationMode activation_mode,
      const DeviceMemory<Eigen::half>& x_bn_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<float>& scale_data,
      const DeviceMemory<float>& offset_data,
      const DeviceMemory<float>& saved_mean_data,
      const DeviceMemory<float>& saved_var_data,
      DeviceMemory<Eigen::half>* x_bn_backprop_data,
      DeviceMemory<float>* scale_backprop_data,
      DeviceMemory<float>* offset_backprop_data,
      dnn::ProfileResult* output_profile_result) override;
619 
  // Returns the GpuExecutor this DNN support instance was created for.
  // The pointer is not owned by the caller (see parent_ below).
  GpuExecutor* GetParentExecutor() { return parent_; }
621 
 private:
  GpuExecutor* parent_;  // Parent executor object. Not owned.

  // Provide access to the MIOpen handle. Wrapped in a forward-declared
  // helper class so this header need not include MIOpen headers.
  std::unique_ptr<class MIOpenAccess> miopen_;
627 
  // Type-dispatched implementation shared by the public batch-norm forward
  // overloads. T is the x/y element type, U the scale/statistics type.
  // In training mode (is_training == true) the batch/saved statistics
  // outputs are populated; var_to_inv_var / inv_var_to_var convert between
  // variance and inverse-variance representations on demand.
  template <class T, class U>
  bool DoBatchNormalizationForwardImpl(
      Stream* stream, dnn::DataType input_data_type,
      dnn::DataType scale_data_type, const DeviceMemory<T>& x,
      const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
      const DeviceMemory<U>& estimated_mean,
      const DeviceMemory<U>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<T>* y, DeviceMemory<U>* batch_mean,
      DeviceMemory<U>* batch_var, DeviceMemory<U>* saved_mean,
      DeviceMemory<U>* saved_inv_var, bool is_training,
      std::function<const DeviceMemory<U>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var);
642 
  // Type-dispatched implementation shared by the public batch-norm backward
  // overloads. The miopen_* parameters are ints to keep miopenDataType_t out
  // of this header.
  template <class T, class U>
  bool DoBatchNormalizationBackwardImpl(
      Stream* stream, int miopen_input_type, int miopen_scale_type,
      const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
      const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
      const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
      DeviceMemory<U>* offset_backprop);
652 
  // Type-dispatched implementation for the convolution bias-gradient
  // computation.
  template <class T>
  bool DoConvolveBackwardBiasImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& input_descriptor,
      const DeviceMemory<T>& input_data,
      const dnn::BatchDescriptor& bias_descriptor,
      DeviceMemory<T>* backward_bias_data);
661 
  // Type-dispatched RNN forward pass. When is_training is true, activations
  // needed by the backward pass are saved via reserve_space_allocator;
  // workspace_allocator supplies temporary scratch memory.
  template <class T>
  bool DoRnnForwardImpl(Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
                        const MIOpenRnnSequenceTensorDescriptor& input_desc,
                        const DeviceMemory<T>& input_data,
                        const MIOpenRnnStateTensorDescriptor& input_h_desc,
                        const DeviceMemory<T>& input_h_data,
                        const MIOpenRnnStateTensorDescriptor& input_c_desc,
                        const DeviceMemory<T>& input_c_data,
                        const DeviceMemory<T>& params,
                        const MIOpenRnnSequenceTensorDescriptor& output_desc,
                        DeviceMemory<T>* output_data,
                        const MIOpenRnnStateTensorDescriptor& output_h_desc,
                        DeviceMemory<T>* output_h_data,
                        const MIOpenRnnStateTensorDescriptor& output_c_desc,
                        DeviceMemory<T>* output_c_data, bool is_training,
                        ScratchAllocator* reserve_space_allocator,
                        ScratchAllocator* workspace_allocator);
  // Type-dispatched RNN backward pass. Consumes the reserve-space data
  // produced by DoRnnForwardImpl and produces input/state/params backprops;
  // any backprop output pointer may presumably be null to skip that gradient
  // -- confirm against the .cc implementation.
  template <class T>
  bool DoRnnBackwardImpl(Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
                         const MIOpenRnnSequenceTensorDescriptor& input_desc,
                         const DeviceMemory<T>& input_data,
                         const MIOpenRnnStateTensorDescriptor& input_h_desc,
                         const DeviceMemory<T>& input_h_data,
                         const MIOpenRnnStateTensorDescriptor& input_c_desc,
                         const DeviceMemory<T>& input_c_data,
                         const DeviceMemory<T>& params,
                         const MIOpenRnnSequenceTensorDescriptor& output_desc,
                         const DeviceMemory<T>& output_data,
                         const MIOpenRnnStateTensorDescriptor& output_h_desc,
                         const DeviceMemory<T>& output_h_data,
                         const MIOpenRnnStateTensorDescriptor& output_c_desc,
                         const DeviceMemory<T>& output_c_data,
                         const DeviceMemory<T>& output_backprop_data,
                         const DeviceMemory<T>& output_h_backprop_data,
                         const DeviceMemory<T>& output_c_backprop_data,
                         DeviceMemory<T>* input_backprop_data,
                         DeviceMemory<T>* input_h_backprop_data,
                         DeviceMemory<T>* input_c_backprop_data,
                         DeviceMemory<T>* params_backprop_data,
                         DeviceMemory<uint8>* reserve_space_data,
                         ScratchAllocator* workspace_allocator);
703 
  // Type-dispatched implementation backing DoFusedConvolutionBiasActivation.
  template <typename T>
  bool DoFusedConvolutionBiasActivationImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<T>& conv_input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<T>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<T>& bias_data, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<T>* output_data, dnn::ProfileResult* output_profile_result);
717 
  // Type-dispatched implementation backing the public
  // DoFusedBatchNormActivationInference overloads. T is the x/y element
  // type, U the scale/statistics type.
  template <typename T, typename U>
  bool DoFusedBatchNormActivationInferenceImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
      const DeviceMemory<U>& mean_data, const DeviceMemory<U>& variance_data,
      double epsilon, dnn::ActivationMode activation_mode,
      DeviceMemory<T>* y_data, dnn::ProfileResult* output_profile_result);
728 
  // Type-dispatched implementation backing the public
  // DoFusedBatchNormActivationForward overloads.
  template <typename T, typename U>
  bool DoFusedBatchNormActivationForwardImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
      double epsilon, dnn::ActivationMode activation_mode,
      DeviceMemory<T>* y_data, DeviceMemory<U>* batch_mean_data,
      DeviceMemory<U>* batch_var_data, DeviceMemory<U>* saved_mean_data,
      DeviceMemory<U>* saved_var_data,
      dnn::ProfileResult* output_profile_result);
741 
  // Type-dispatched implementation backing the public
  // DoFusedBatchNormActivationBackward overloads.
  template <typename T, typename U>
  bool DoFusedBatchNormActivationBackwardImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& y_act_backprop_descriptor,
      const DeviceMemory<T>& y_act_backprop_data,
      const DeviceMemory<T>& y_act_data, dnn::ActivationMode activation_mode,
      const DeviceMemory<T>& x_bn_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
      const DeviceMemory<U>& saved_mean_data,
      const DeviceMemory<U>& saved_var_data,
      DeviceMemory<T>* x_bn_backprop_data, DeviceMemory<U>* scale_backprop_data,
      DeviceMemory<U>* offset_backprop_data,
      dnn::ProfileResult* output_profile_result);
757 
  // Selects the convolution algorithm and allocates any scratch memory
  // required before running a convolution of the given kind. Outputs the
  // chosen algorithm_desc and scratch_memory. Overrides dnn::DnnSupport;
  // see dnn.h for the contract.
  port::Status DoPrepareForConvolution(
      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      DeviceMemoryBase filter_data,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemoryBase output_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::AlgorithmConfig& algorithm_config,
      ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
      DeviceMemory<uint8>* scratch_memory) override;

  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenSupport);
771 };
772 
773 }  // namespace gpu
774 }  // namespace stream_executor
775 
776 #endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_
777