1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // The ROCM-specific DNN library support, implementing the general DnnSupport 17 // interface. 18 19 #ifndef TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_ 20 #define TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_ 21 22 #include "tensorflow/stream_executor/dnn.h" 23 #include "tensorflow/stream_executor/lib/status.h" 24 #include "tensorflow/stream_executor/platform/mutex.h" 25 #include "tensorflow/stream_executor/platform/thread_annotations.h" 26 #include "tensorflow/stream_executor/plugin_registry.h" 27 #include "tensorflow/stream_executor/temporary_device_memory.h" 28 29 namespace stream_executor { 30 namespace gpu { 31 32 class GpuExecutor; 33 class MIOpenRnnDescriptor; 34 class MIOpenRnnSequenceTensorDescriptor; 35 class MIOpenRnnStateTensorDescriptor; 36 // Opaque and unique identifier for the MIOpen plugin. 37 extern const PluginId kMIOpenPlugin; 38 39 // miopen-library based DNN support. For details on overridden interface 40 // functions, see dnn.h. 
41 class MIOpenSupport : public dnn::DnnSupport { 42 public: 43 explicit MIOpenSupport(GpuExecutor* parent); 44 45 port::Status Init() override; 46 port::StatusOr<perftools::gputools::dnn::VersionInfo> GetVersion() override; 47 48 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>> createRnnDescriptor( 49 int num_layers, int hidden_size, int input_size, int batch_size, 50 dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, 51 dnn::RnnMode rnn_mode, dnn::DataType data_type, 52 const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed, 53 ScratchAllocator* state_allocator) override; 54 55 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>> 56 createRnnSequenceTensorDescriptor(int seq_length, int batch_size, 57 int data_size, 58 dnn::DataType data_type) override; 59 60 port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>> 61 createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size, 62 dnn::DataType data_type) override; 63 64 bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, 65 const dnn::RnnSequenceTensorDescriptor& input_desc, 66 const DeviceMemory<Eigen::half>& input_data, 67 const dnn::RnnStateTensorDescriptor& input_h_desc, 68 const DeviceMemory<Eigen::half>& input_h_data, 69 const dnn::RnnStateTensorDescriptor& input_c_desc, 70 const DeviceMemory<Eigen::half>& input_c_data, 71 const DeviceMemory<Eigen::half>& params, 72 const dnn::RnnSequenceTensorDescriptor& output_desc, 73 DeviceMemory<Eigen::half>* output_data, 74 const dnn::RnnStateTensorDescriptor& output_h_desc, 75 DeviceMemory<Eigen::half>* output_h_data, 76 const dnn::RnnStateTensorDescriptor& output_c_desc, 77 DeviceMemory<Eigen::half>* output_c_data, bool is_training, 78 ScratchAllocator* reserve_space_allocator, 79 ScratchAllocator* workspace_allocator, 80 dnn::ProfileResult* output_profile_result) override; 81 82 bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, 83 const 
dnn::RnnSequenceTensorDescriptor& input_desc, 84 const DeviceMemory<float>& input_data, 85 const dnn::RnnStateTensorDescriptor& input_h_desc, 86 const DeviceMemory<float>& input_h_data, 87 const dnn::RnnStateTensorDescriptor& input_c_desc, 88 const DeviceMemory<float>& input_c_data, 89 const DeviceMemory<float>& params, 90 const dnn::RnnSequenceTensorDescriptor& output_desc, 91 DeviceMemory<float>* output_data, 92 const dnn::RnnStateTensorDescriptor& output_h_desc, 93 DeviceMemory<float>* output_h_data, 94 const dnn::RnnStateTensorDescriptor& output_c_desc, 95 DeviceMemory<float>* output_c_data, bool is_training, 96 ScratchAllocator* reserve_space_allocator, 97 ScratchAllocator* workspace_allocator, 98 dnn::ProfileResult* output_profile_result) override; 99 100 bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, 101 const dnn::RnnSequenceTensorDescriptor& input_desc, 102 const DeviceMemory<double>& input_data, 103 const dnn::RnnStateTensorDescriptor& input_h_desc, 104 const DeviceMemory<double>& input_h_data, 105 const dnn::RnnStateTensorDescriptor& input_c_desc, 106 const DeviceMemory<double>& input_c_data, 107 const DeviceMemory<double>& params, 108 const dnn::RnnSequenceTensorDescriptor& output_desc, 109 DeviceMemory<double>* output_data, 110 const dnn::RnnStateTensorDescriptor& output_h_desc, 111 DeviceMemory<double>* output_h_data, 112 const dnn::RnnStateTensorDescriptor& output_c_desc, 113 DeviceMemory<double>* output_c_data, bool is_training, 114 ScratchAllocator* reserve_space_allocator, 115 ScratchAllocator* workspace_allocator, 116 dnn::ProfileResult* output_profile_result) override; 117 118 bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, 119 const dnn::RnnSequenceTensorDescriptor& input_desc, 120 const DeviceMemory<Eigen::half>& input_data, 121 const dnn::RnnStateTensorDescriptor& input_h_desc, 122 const DeviceMemory<Eigen::half>& input_h_data, 123 const dnn::RnnStateTensorDescriptor& input_c_desc, 124 const 
DeviceMemory<Eigen::half>& input_c_data, 125 const DeviceMemory<Eigen::half>& params, 126 const dnn::RnnSequenceTensorDescriptor& output_desc, 127 const DeviceMemory<Eigen::half>& output_data, 128 const dnn::RnnStateTensorDescriptor& output_h_desc, 129 const DeviceMemory<Eigen::half>& output_h_data, 130 const dnn::RnnStateTensorDescriptor& output_c_desc, 131 const DeviceMemory<Eigen::half>& output_c_data, 132 const DeviceMemory<Eigen::half>& output_backprop_data, 133 const DeviceMemory<Eigen::half>& output_h_backprop_data, 134 const DeviceMemory<Eigen::half>& output_c_backprop_data, 135 DeviceMemory<Eigen::half>* input_backprop_data, 136 DeviceMemory<Eigen::half>* input_h_backprop_data, 137 DeviceMemory<Eigen::half>* input_c_backprop_data, 138 DeviceMemory<Eigen::half>* params_backprop_data, 139 DeviceMemory<uint8>* reserve_space_data, 140 ScratchAllocator* workspace_allocator, 141 dnn::ProfileResult* output_profile_result) override; 142 143 bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, 144 const dnn::RnnSequenceTensorDescriptor& input_desc, 145 const DeviceMemory<float>& input_data, 146 const dnn::RnnStateTensorDescriptor& input_h_desc, 147 const DeviceMemory<float>& input_h_data, 148 const dnn::RnnStateTensorDescriptor& input_c_desc, 149 const DeviceMemory<float>& input_c_data, 150 const DeviceMemory<float>& params, 151 const dnn::RnnSequenceTensorDescriptor& output_desc, 152 const DeviceMemory<float>& output_data, 153 const dnn::RnnStateTensorDescriptor& output_h_desc, 154 const DeviceMemory<float>& output_h_data, 155 const dnn::RnnStateTensorDescriptor& output_c_desc, 156 const DeviceMemory<float>& output_c_data, 157 const DeviceMemory<float>& output_backprop_data, 158 const DeviceMemory<float>& output_h_backprop_data, 159 const DeviceMemory<float>& output_c_backprop_data, 160 DeviceMemory<float>* input_backprop_data, 161 DeviceMemory<float>* input_h_backprop_data, 162 DeviceMemory<float>* input_c_backprop_data, 163 DeviceMemory<float>* 
params_backprop_data, 164 DeviceMemory<uint8>* reserve_space_data, 165 ScratchAllocator* workspace_allocator, 166 dnn::ProfileResult* output_profile_result) override; 167 168 bool DoRnnBackward(Stream* stream, const dnn::RnnDescriptor& rnn_desc, 169 const dnn::RnnSequenceTensorDescriptor& input_desc, 170 const DeviceMemory<double>& input_data, 171 const dnn::RnnStateTensorDescriptor& input_h_desc, 172 const DeviceMemory<double>& input_h_data, 173 const dnn::RnnStateTensorDescriptor& input_c_desc, 174 const DeviceMemory<double>& input_c_data, 175 const DeviceMemory<double>& params, 176 const dnn::RnnSequenceTensorDescriptor& output_desc, 177 const DeviceMemory<double>& output_data, 178 const dnn::RnnStateTensorDescriptor& output_h_desc, 179 const DeviceMemory<double>& output_h_data, 180 const dnn::RnnStateTensorDescriptor& output_c_desc, 181 const DeviceMemory<double>& output_c_data, 182 const DeviceMemory<double>& output_backprop_data, 183 const DeviceMemory<double>& output_h_backprop_data, 184 const DeviceMemory<double>& output_c_backprop_data, 185 DeviceMemory<double>* input_backprop_data, 186 DeviceMemory<double>* input_h_backprop_data, 187 DeviceMemory<double>* input_c_backprop_data, 188 DeviceMemory<double>* params_backprop_data, 189 DeviceMemory<uint8>* reserve_space_data, 190 ScratchAllocator* workspace_allocator, 191 dnn::ProfileResult* output_profile_result) override; 192 193 bool GetConvolveAlgorithms( 194 bool with_winograd_nonfused, int cc_major, int cc_minor, 195 std::vector<dnn::AlgorithmDesc>* out_algorithms) override; 196 197 bool GetRnnAlgorithms( 198 std::vector<dnn::AlgorithmDesc>* out_algorithms) override; 199 200 bool GetConvolveBackwardDataAlgorithms( 201 bool with_winograd_nonfused, int cc_major, int cc_minor, 202 std::vector<dnn::AlgorithmDesc>* out_algorithms) override; 203 204 bool GetConvolveBackwardFilterAlgorithms( 205 bool with_winograd_nonfused, int cc_major, int cc_minor, 206 std::vector<dnn::AlgorithmDesc>* out_algorithms) override; 
207 208 bool DoBatchNormalizationForward( 209 Stream* stream, const DeviceMemory<float>& x, 210 const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, 211 const DeviceMemory<float>& estimated_mean, 212 const DeviceMemory<float>& estimated_variance, 213 const dnn::BatchDescriptor& x_desc, 214 const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, 215 DeviceMemory<float>* y, DeviceMemory<float>* batch_mean, 216 DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean, 217 DeviceMemory<float>* saved_inv_var, bool is_training, 218 std::function<const DeviceMemory<float>&()> var_to_inv_var, 219 std::function<void()> inv_var_to_var) override; 220 221 bool DoBatchNormalizationForward( 222 Stream* stream, const DeviceMemory<Eigen::half>& x, 223 const DeviceMemory<float>& scale, const DeviceMemory<float>& offset, 224 const DeviceMemory<float>& estimated_mean, 225 const DeviceMemory<float>& estimated_variance, 226 const dnn::BatchDescriptor& x_desc, 227 const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, 228 DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean, 229 DeviceMemory<float>* batch_var, DeviceMemory<float>* saved_mean, 230 DeviceMemory<float>* saved_inv_var, bool is_training, 231 std::function<const DeviceMemory<float>&()> var_to_inv_var, 232 std::function<void()> inv_var_to_var) override; 233 234 bool DoBatchNormalizationBackward( 235 Stream* stream, const DeviceMemory<float>& y_backprop, 236 const DeviceMemory<float>& x, const DeviceMemory<float>& scale, 237 const DeviceMemory<float>& mean, const DeviceMemory<float>& variance, 238 const dnn::BatchDescriptor& x_desc, 239 const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, 240 DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop, 241 DeviceMemory<float>* offset_backprop) override; 242 243 bool DoBatchNormalizationBackward( 244 Stream* stream, const DeviceMemory<Eigen::half>& y_backprop, 245 const DeviceMemory<Eigen::half>& 
x, const DeviceMemory<float>& scale, 246 const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var, 247 const dnn::BatchDescriptor& x_desc, 248 const dnn::BatchDescriptor& scale_offset_desc, const double epsilon, 249 DeviceMemory<Eigen::half>* x_backprop, 250 DeviceMemory<float>* scale_backprop, 251 DeviceMemory<float>* offset_backprop) override; 252 253 port::Status DoConvolve( 254 dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream, 255 const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data, 256 const dnn::FilterDescriptor& filter_descriptor, 257 DeviceMemoryBase filter_data, 258 const dnn::BatchDescriptor& output_descriptor, 259 DeviceMemoryBase output_data, 260 const dnn::ConvolutionDescriptor& convolution_descriptor, 261 dnn::AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory, 262 dnn::ProfileResult* output_profile_result) override; 263 264 bool DoFusedConvolve( 265 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, 266 const DeviceMemory<double>& conv_input_data, double conv_input_scale, 267 const dnn::FilterDescriptor& filter_descriptor, 268 const DeviceMemory<double>& filter_data, 269 const dnn::ConvolutionDescriptor& convolution_descriptor, 270 const DeviceMemory<double>& side_input_data, double side_input_scale, 271 const dnn::BatchDescriptor& bias_descriptor, 272 const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode, 273 const dnn::BatchDescriptor& output_descriptor, 274 DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator, 275 const dnn::AlgorithmConfig& algorithm_config, 276 dnn::ProfileResult* output_profile_result) override; 277 278 bool DoFusedConvolve( 279 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, 280 const DeviceMemory<float>& conv_input_data, float conv_input_scale, 281 const dnn::FilterDescriptor& filter_descriptor, 282 const DeviceMemory<float>& filter_data, 283 const dnn::ConvolutionDescriptor& 
convolution_descriptor, 284 const DeviceMemory<float>& side_input_data, float side_input_scale, 285 const dnn::BatchDescriptor& bias_descriptor, 286 const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode, 287 const dnn::BatchDescriptor& output_descriptor, 288 DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator, 289 const dnn::AlgorithmConfig& algorithm_config, 290 dnn::ProfileResult* output_profile_result) override; 291 292 bool DoFusedConvolve(Stream* stream, 293 const dnn::BatchDescriptor& conv_input_descriptor, 294 const DeviceMemory<Eigen::half>& conv_input_data, 295 float conv_input_scale, 296 const dnn::FilterDescriptor& filter_descriptor, 297 const DeviceMemory<Eigen::half>& filter_data, 298 const dnn::ConvolutionDescriptor& convolution_descriptor, 299 const DeviceMemory<Eigen::half>& side_input_data, 300 float side_input_scale, 301 const dnn::BatchDescriptor& bias_descriptor, 302 const DeviceMemory<Eigen::half>& biases, 303 dnn::ActivationMode activation_mode, 304 const dnn::BatchDescriptor& output_descriptor, 305 DeviceMemory<Eigen::half>* output_data, 306 ScratchAllocator* scratch_allocator, 307 const dnn::AlgorithmConfig& algorithm_config, 308 dnn::ProfileResult* output_profile_result) override; 309 310 bool DoFusedConvolve( 311 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, 312 const DeviceMemory<int8>& conv_input_data, float conv_input_scale, 313 const dnn::FilterDescriptor& filter_descriptor, 314 const DeviceMemory<int8>& filter_data, 315 const dnn::ConvolutionDescriptor& convolution_descriptor, 316 const DeviceMemory<int8>& side_input_data, float side_input_scale, 317 const dnn::BatchDescriptor& bias_descriptor, 318 const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode, 319 const dnn::BatchDescriptor& output_descriptor, 320 DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator, 321 const dnn::AlgorithmConfig& algorithm_config, 322 dnn::ProfileResult* 
output_profile_result) override; 323 DoConvolveQuantized(Stream * stream,const dnn::BatchDescriptor & input_descriptor,const DeviceMemory<float> & input_data,const dnn::FilterDescriptor & filter_descriptor,const DeviceMemory<int8> & filter_coefficients,const DeviceMemory<float> & coefficient_scales,const dnn::ConvolutionDescriptor & convolution_descriptor,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data)324 bool DoConvolveQuantized( 325 Stream* stream, const dnn::BatchDescriptor& input_descriptor, 326 const DeviceMemory<float>& input_data, 327 const dnn::FilterDescriptor& filter_descriptor, 328 const DeviceMemory<int8>& filter_coefficients, 329 const DeviceMemory<float>& coefficient_scales, 330 const dnn::ConvolutionDescriptor& convolution_descriptor, 331 const dnn::BatchDescriptor& output_descriptor, 332 DeviceMemory<float>* output_data) override { 333 LOG(ERROR) << "DoConvolveQuantized not supported by MIOpen"; 334 return false; 335 } 336 DoConvolveQuantized(Stream * stream,const dnn::BatchDescriptor & input_descriptor,const DeviceMemory<float> & input_data,const dnn::FilterDescriptor & filter_descriptor,const DeviceMemory<int16> & filter_coefficients,const DeviceMemory<float> & coefficient_scales,const dnn::ConvolutionDescriptor & convolution_descriptor,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data)337 bool DoConvolveQuantized( 338 Stream* stream, const dnn::BatchDescriptor& input_descriptor, 339 const DeviceMemory<float>& input_data, 340 const dnn::FilterDescriptor& filter_descriptor, 341 const DeviceMemory<int16>& filter_coefficients, 342 const DeviceMemory<float>& coefficient_scales, 343 const dnn::ConvolutionDescriptor& convolution_descriptor, 344 const dnn::BatchDescriptor& output_descriptor, 345 DeviceMemory<float>* output_data) override { 346 LOG(ERROR) << "DoConvolveQuantized not supported by MIOpen"; 347 return false; 348 } 349 DoSeparableConvolve(Stream * stream,const 
dnn::BatchDescriptor & batch_descriptor,const DeviceMemory<float> & input_data,const dnn::FilterDescriptor & filter_descriptor,int depth_multiplier,const DeviceMemory<float> & first_weights,const DeviceMemory<float> & second_weights,const dnn::ConvolutionDescriptor & convolution_descriptor,const dnn::BatchDescriptor & output_descriptor,DeviceMemory<float> * output_data)350 bool DoSeparableConvolve( 351 Stream* stream, const dnn::BatchDescriptor& batch_descriptor, 352 const DeviceMemory<float>& input_data, 353 const dnn::FilterDescriptor& filter_descriptor, int depth_multiplier, 354 const DeviceMemory<float>& first_weights, 355 const DeviceMemory<float>& second_weights, 356 const dnn::ConvolutionDescriptor& convolution_descriptor, 357 const dnn::BatchDescriptor& output_descriptor, 358 DeviceMemory<float>* output_data) override { 359 LOG(ERROR) << "separable convolution not supported by MIOpen"; 360 return false; 361 } 362 363 bool DoConvolveBackwardBias( 364 Stream* stream, const dnn::BatchDescriptor& input_descriptor, 365 const DeviceMemory<double>& input_data, 366 const dnn::BatchDescriptor& bias_descriptor, 367 DeviceMemory<double>* backward_bias_data) override; 368 369 bool DoConvolveBackwardBias(Stream* stream, 370 const dnn::BatchDescriptor& input_descriptor, 371 const DeviceMemory<float>& input_data, 372 const dnn::BatchDescriptor& bias_descriptor, 373 DeviceMemory<float>* backward_bias_data) override; 374 375 bool DoConvolveBackwardBias( 376 Stream* stream, const dnn::BatchDescriptor& input_descriptor, 377 const DeviceMemory<Eigen::half>& input_data, 378 const dnn::BatchDescriptor& bias_descriptor, 379 DeviceMemory<Eigen::half>* backward_bias_data) override; 380 381 bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data, 382 const DeviceMemory<float>& weights, 383 const dnn::BatchDescriptor& input_dimensions, 384 const dnn::BatchDescriptor& output_dimensions, 385 DeviceMemory<float>* output_data) override; 386 DoMatMulQuantized(Stream * 
stream,const DeviceMemory<float> & input_data,const DeviceMemory<int8> & quantized_weights,const DeviceMemory<float> & weight_scales,const dnn::BatchDescriptor & input_dimensions,const dnn::BatchDescriptor & output_dimensions,DeviceMemory<float> * output_data)387 bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, 388 const DeviceMemory<int8>& quantized_weights, 389 const DeviceMemory<float>& weight_scales, 390 const dnn::BatchDescriptor& input_dimensions, 391 const dnn::BatchDescriptor& output_dimensions, 392 DeviceMemory<float>* output_data) override { 393 LOG(ERROR) << "DNN MatMulQuantized not supported by MIOpen"; 394 return false; 395 } 396 DoMatMulQuantized(Stream * stream,const DeviceMemory<float> & input_data,const DeviceMemory<int16> & quantized_weights,const DeviceMemory<float> & weight_scales,const dnn::BatchDescriptor & input_dimensions,const dnn::BatchDescriptor & output_dimensions,DeviceMemory<float> * output_data)397 bool DoMatMulQuantized(Stream* stream, const DeviceMemory<float>& input_data, 398 const DeviceMemory<int16>& quantized_weights, 399 const DeviceMemory<float>& weight_scales, 400 const dnn::BatchDescriptor& input_dimensions, 401 const dnn::BatchDescriptor& output_dimensions, 402 DeviceMemory<float>* output_data) override { 403 LOG(ERROR) << "DNN MatMulQuantized not supported by MIOpen"; 404 return false; 405 } 406 407 bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, 408 const DeviceMemory<float>& biases, 409 const dnn::BatchDescriptor& dimensions, 410 DeviceMemory<float>* output_data) override; 411 412 bool DoActivate(Stream* stream, dnn::ActivationMode activation_mode, 413 const dnn::BatchDescriptor& dimensions, 414 const DeviceMemory<float>& input_data, 415 DeviceMemory<float>* output_data, uint64 options) override; 416 417 bool DoPoolForward(Stream* stream, 418 const dnn::PoolingDescriptor& pooling_dimensions, 419 const dnn::BatchDescriptor& input_dimensions, 420 const 
DeviceMemory<double>& input_data, 421 const dnn::BatchDescriptor& output_dimensions, 422 DeviceMemory<double>* output_data, 423 ScratchAllocator* workspace_allocator = nullptr) override; 424 425 bool DoPoolForward(Stream* stream, 426 const dnn::PoolingDescriptor& pooling_dimensions, 427 const dnn::BatchDescriptor& input_dimensions, 428 const DeviceMemory<float>& input_data, 429 const dnn::BatchDescriptor& output_dimensions, 430 DeviceMemory<float>* output_data, 431 ScratchAllocator* workspace_allocator = nullptr) override; 432 433 bool DoPoolForward(Stream* stream, 434 const dnn::PoolingDescriptor& pooling_dimensions, 435 const dnn::BatchDescriptor& input_dimensions, 436 const DeviceMemory<Eigen::half>& input_data, 437 const dnn::BatchDescriptor& output_dimensions, 438 DeviceMemory<Eigen::half>* output_data, 439 ScratchAllocator* workspace_allocator = nullptr) override; 440 441 bool DoPoolBackward(Stream* stream, 442 const dnn::PoolingDescriptor& pooling_dimensions, 443 const dnn::BatchDescriptor& input_dimensions, 444 const DeviceMemory<double>& input_data, 445 const dnn::BatchDescriptor& output_dimensions, 446 const DeviceMemory<double>& output_data, 447 const DeviceMemory<double>& input_diff_data, 448 DeviceMemory<double>* output_diff_data, 449 ScratchAllocator* workspace_allocator = nullptr) override; 450 451 bool DoPoolBackward(Stream* stream, 452 const dnn::PoolingDescriptor& pooling_dimensions, 453 const dnn::BatchDescriptor& input_dimensions, 454 const DeviceMemory<float>& input_data, 455 const dnn::BatchDescriptor& output_dimensions, 456 const DeviceMemory<float>& output_data, 457 const DeviceMemory<float>& input_diff_data, 458 DeviceMemory<float>* output_diff_data, 459 ScratchAllocator* workspace_allocator = nullptr) override; 460 461 bool DoPoolBackward(Stream* stream, 462 const dnn::PoolingDescriptor& pooling_dimensions, 463 const dnn::BatchDescriptor& input_dimensions, 464 const DeviceMemory<Eigen::half>& input_data, 465 const dnn::BatchDescriptor& 
output_dimensions, 466 const DeviceMemory<Eigen::half>& output_data, 467 const DeviceMemory<Eigen::half>& input_diff_data, 468 DeviceMemory<Eigen::half>* output_diff_data, 469 ScratchAllocator* workspace_allocator = nullptr) override; 470 471 bool DoNormalizeWithDimensions( 472 Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, 473 const dnn::BatchDescriptor& dimensions, 474 const DeviceMemory<float>& input_data, 475 DeviceMemory<float>* output_data) override; 476 477 bool DoNormalizeBackwardWithDimensions( 478 Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, 479 const dnn::BatchDescriptor& dimensions, 480 const DeviceMemory<float>& raw_data, 481 const DeviceMemory<float>& normalized_data, 482 const DeviceMemory<float>& normalized_variable_gradient, 483 DeviceMemory<float>* raw_variable_gradient, 484 ScratchAllocator* workspace_allocator = nullptr) override; 485 486 bool DoDepthConcatenate( 487 Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, 488 port::ArraySlice<const DeviceMemory<float>*> input_data, 489 DeviceMemory<float>* output_data) override; 490 491 bool DoElementwiseOperate( 492 Stream* stream, dnn::ElementwiseOperation operation, 493 port::ArraySlice<dnn::BatchDescriptor> input_dimensions, 494 port::ArraySlice<const DeviceMemory<float>*> input_data, 495 const dnn::BatchDescriptor& output_dimensions, 496 DeviceMemory<float>* output_data) override; 497 498 bool DoXYPad(Stream* stream, const dnn::BatchDescriptor& dimensions, 499 const DeviceMemory<float>& input_data, int64 left_pad, 500 int64 right_pad, int64 top_pad, int64 bottom_pad, 501 DeviceMemory<float>* output_data) override; 502 503 bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor& dimensions, 504 const DeviceMemory<float>& input_data, int64 left_trim, 505 int64 right_trim, int64 top_trim, int64 bottom_trim, 506 DeviceMemory<float>* output_data) override; 507 508 bool DoMemcpyD2HQuantized(Stream* stream, 509 const 
DeviceMemory<float>& device_unquantized_src, 510 dnn::QuantizedActivationMode mode, void* host_dst, 511 int64 size) override; 512 513 bool DoMemcpyH2DQuantized( 514 Stream* stream, const void* host_src, int64 size, 515 dnn::QuantizedActivationMode mode, 516 DeviceMemory<float>* device_unquantized_dst) override; 517 518 // Derives an output batch descriptor from an input batch and convolution 519 // descriptors. 520 bool DeriveOutputBatchDescriptor( 521 const dnn::BatchDescriptor& batch_descriptor, 522 const dnn::FilterDescriptor& filter_descriptor, 523 const dnn::ConvolutionDescriptor& convolution_descriptor, 524 dnn::BatchDescriptor* output_batch_descriptor); 525 526 bool DoTransformTensor(Stream* stream, const dnn::BatchDescriptor& input_desc, 527 dnn::DataType input_type, 528 const DeviceMemoryBase& input_data, 529 const dnn::BatchDescriptor& output_desc, 530 dnn::DataType output_type, float scale, 531 DeviceMemoryBase* output_data) override; 532 533 bool DoFusedConvolutionBiasActivation( 534 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, 535 const DeviceMemory<float>& conv_input_data, 536 const dnn::FilterDescriptor& filter_descriptor, 537 const DeviceMemory<float>& filter_data, 538 const dnn::ConvolutionDescriptor& convolution_descriptor, 539 const dnn::BatchDescriptor& bias_descriptor, 540 const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode, 541 const dnn::BatchDescriptor& output_descriptor, 542 DeviceMemory<float>* output_data, 543 dnn::ProfileResult* output_profile_result) override; 544 545 bool DoFusedBatchNormActivationInference( 546 Stream* stream, const dnn::BatchDescriptor& x_descriptor, 547 const DeviceMemory<float>& x_data, 548 const dnn::BatchDescriptor& scale_mean_variance_descriptor, 549 const DeviceMemory<float>& scale_data, 550 const DeviceMemory<float>& offset_data, 551 const DeviceMemory<float>& mean_data, 552 const DeviceMemory<float>& variance_data, double epsilon, 553 dnn::ActivationMode 
activation_mode, DeviceMemory<float>* y_data, 554 dnn::ProfileResult* output_profile_result) override; 555 556 bool DoFusedBatchNormActivationInference( 557 Stream* stream, const dnn::BatchDescriptor& x_descriptor, 558 const DeviceMemory<Eigen::half>& x_data, 559 const dnn::BatchDescriptor& scale_mean_variance_descriptor, 560 const DeviceMemory<float>& scale_data, 561 const DeviceMemory<float>& offset_data, 562 const DeviceMemory<float>& mean_data, 563 const DeviceMemory<float>& variance_data, double epsilon, 564 dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data, 565 dnn::ProfileResult* output_profile_result) override; 566 567 bool DoFusedBatchNormActivationForward( 568 Stream* stream, const dnn::BatchDescriptor& x_descriptor, 569 const DeviceMemory<float>& x_data, 570 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 571 const DeviceMemory<float>& scale_data, 572 const DeviceMemory<float>& offset_data, double epsilon, 573 dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data, 574 DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data, 575 DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data, 576 dnn::ProfileResult* output_profile_result) override; 577 578 bool DoFusedBatchNormActivationForward( 579 Stream* stream, const dnn::BatchDescriptor& x_descriptor, 580 const DeviceMemory<Eigen::half>& x_data, 581 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 582 const DeviceMemory<float>& scale_data, 583 const DeviceMemory<float>& offset_data, double epsilon, 584 dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data, 585 DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data, 586 DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data, 587 dnn::ProfileResult* output_profile_result) override; 588 589 bool DoFusedBatchNormActivationBackward( 590 Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor, 
591 const DeviceMemory<float>& y_act_backprop_data, 592 const DeviceMemory<float>& y_act_data, 593 dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data, 594 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 595 const DeviceMemory<float>& scale_data, 596 const DeviceMemory<float>& offset_data, 597 const DeviceMemory<float>& saved_mean_data, 598 const DeviceMemory<float>& saved_var_data, 599 DeviceMemory<float>* x_bn_backprop_data, 600 DeviceMemory<float>* scale_backprop_data, 601 DeviceMemory<float>* offset_backprop_data, 602 dnn::ProfileResult* output_profile_result) override; 603 604 bool DoFusedBatchNormActivationBackward( 605 Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor, 606 const DeviceMemory<Eigen::half>& y_act_backprop_data, 607 const DeviceMemory<Eigen::half>& y_act_data, 608 dnn::ActivationMode activation_mode, 609 const DeviceMemory<Eigen::half>& x_bn_data, 610 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 611 const DeviceMemory<float>& scale_data, 612 const DeviceMemory<float>& offset_data, 613 const DeviceMemory<float>& saved_mean_data, 614 const DeviceMemory<float>& saved_var_data, 615 DeviceMemory<Eigen::half>* x_bn_backprop_data, 616 DeviceMemory<float>* scale_backprop_data, 617 DeviceMemory<float>* offset_backprop_data, 618 dnn::ProfileResult* output_profile_result) override; 619 GetParentExecutor()620 GpuExecutor* GetParentExecutor() { return parent_; } 621 622 private: 623 GpuExecutor* parent_; // Parent executor object. Not owned. 624 625 // Provide access to the MIOpen handle. 
  // MIOpenAccess is only forward-declared here; its definition (and the
  // wrapped MIOpen library handle) lives in the implementation file.
  std::unique_ptr<class MIOpenAccess> miopen_;

  // Type-parameterized worker shared by the public batch-normalization
  // forward overloads. T is the activation element type, U the type of
  // scale/offset and the batch statistics. When is_training is true the
  // batch mean/variance outputs are populated; var_to_inv_var /
  // inv_var_to_var convert between variance and inverse-variance
  // representations (exact contract defined in the .cc — confirm there).
  template <class T, class U>
  bool DoBatchNormalizationForwardImpl(
      Stream* stream, dnn::DataType input_data_type,
      dnn::DataType scale_data_type, const DeviceMemory<T>& x,
      const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
      const DeviceMemory<U>& estimated_mean,
      const DeviceMemory<U>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<T>* y, DeviceMemory<U>* batch_mean,
      DeviceMemory<U>* batch_var, DeviceMemory<U>* saved_mean,
      DeviceMemory<U>* saved_inv_var, bool is_training,
      std::function<const DeviceMemory<U>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var);

  // Worker for the batch-normalization backward pass. Note that unlike the
  // forward worker this one takes the element types as raw ints
  // (actually miopenDataType_t values, matching the workers below).
  template <class T, class U>
  bool DoBatchNormalizationBackwardImpl(
      Stream* stream, int miopen_input_type, int miopen_scale_type,
      const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
      const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
      const DeviceMemory<U>& variance, const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
      DeviceMemory<U>* offset_backprop);

  // Worker computing the bias gradient of a convolution for element type T.
  template <class T>
  bool DoConvolveBackwardBiasImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& input_descriptor,
      const DeviceMemory<T>& input_data,
      const dnn::BatchDescriptor& bias_descriptor,
      DeviceMemory<T>* backward_bias_data);

  // Type-parameterized worker behind the public DoRnnForward overloads.
  // reserve_space_allocator is presumably only used when is_training is
  // true — confirm against the implementation.
  template <class T>
  bool DoRnnForwardImpl(Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
                        const MIOpenRnnSequenceTensorDescriptor& input_desc,
                        const DeviceMemory<T>& input_data,
                        const MIOpenRnnStateTensorDescriptor& input_h_desc,
                        const DeviceMemory<T>& input_h_data,
                        const MIOpenRnnStateTensorDescriptor& input_c_desc,
                        const DeviceMemory<T>& input_c_data,
                        const DeviceMemory<T>& params,
                        const MIOpenRnnSequenceTensorDescriptor& output_desc,
                        DeviceMemory<T>* output_data,
                        const MIOpenRnnStateTensorDescriptor& output_h_desc,
                        DeviceMemory<T>* output_h_data,
                        const MIOpenRnnStateTensorDescriptor& output_c_desc,
                        DeviceMemory<T>* output_c_data, bool is_training,
                        ScratchAllocator* reserve_space_allocator,
                        ScratchAllocator* workspace_allocator);

  // Type-parameterized worker behind the public DoRnnBackward overloads.
  // Consumes the forward pass's outputs plus their backprop gradients and
  // produces gradients for the inputs, the initial h/c states and params.
  template <class T>
  bool DoRnnBackwardImpl(Stream* stream, const MIOpenRnnDescriptor& rnn_desc,
                         const MIOpenRnnSequenceTensorDescriptor& input_desc,
                         const DeviceMemory<T>& input_data,
                         const MIOpenRnnStateTensorDescriptor& input_h_desc,
                         const DeviceMemory<T>& input_h_data,
                         const MIOpenRnnStateTensorDescriptor& input_c_desc,
                         const DeviceMemory<T>& input_c_data,
                         const DeviceMemory<T>& params,
                         const MIOpenRnnSequenceTensorDescriptor& output_desc,
                         const DeviceMemory<T>& output_data,
                         const MIOpenRnnStateTensorDescriptor& output_h_desc,
                         const DeviceMemory<T>& output_h_data,
                         const MIOpenRnnStateTensorDescriptor& output_c_desc,
                         const DeviceMemory<T>& output_c_data,
                         const DeviceMemory<T>& output_backprop_data,
                         const DeviceMemory<T>& output_h_backprop_data,
                         const DeviceMemory<T>& output_c_backprop_data,
                         DeviceMemory<T>* input_backprop_data,
                         DeviceMemory<T>* input_h_backprop_data,
                         DeviceMemory<T>* input_c_backprop_data,
                         DeviceMemory<T>* params_backprop_data,
                         DeviceMemory<uint8>* reserve_space_data,
                         ScratchAllocator* workspace_allocator);

  // Worker for the fused convolution + bias + activation operation.
  template <typename T>
  bool DoFusedConvolutionBiasActivationImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<T>& conv_input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<T>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<T>& bias_data, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<T>* output_data, dnn::ProfileResult* output_profile_result);

  // Worker for fused BatchNorm + activation in inference mode: uses the
  // precomputed mean/variance rather than batch statistics.
  template <typename T, typename U>
  bool DoFusedBatchNormActivationInferenceImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
      const DeviceMemory<U>& mean_data, const DeviceMemory<U>& variance_data,
      double epsilon, dnn::ActivationMode activation_mode,
      DeviceMemory<T>* y_data, dnn::ProfileResult* output_profile_result);

  // Worker for fused BatchNorm + activation in training mode: additionally
  // emits batch and saved mean/variance for the backward pass.
  template <typename T, typename U>
  bool DoFusedBatchNormActivationForwardImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& x_descriptor, const DeviceMemory<T>& x_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
      double epsilon, dnn::ActivationMode activation_mode,
      DeviceMemory<T>* y_data, DeviceMemory<U>* batch_mean_data,
      DeviceMemory<U>* batch_var_data, DeviceMemory<U>* saved_mean_data,
      DeviceMemory<U>* saved_var_data,
      dnn::ProfileResult* output_profile_result);

  // Worker for the fused BatchNorm + activation backward pass, shared by
  // the public float and Eigen::half overloads above.
  template <typename T, typename U>
  bool DoFusedBatchNormActivationBackwardImpl(
      Stream* stream,
      int miopen_type,  // Actually miopenDataType_t.
      const dnn::BatchDescriptor& y_act_backprop_descriptor,
      const DeviceMemory<T>& y_act_backprop_data,
      const DeviceMemory<T>& y_act_data, dnn::ActivationMode activation_mode,
      const DeviceMemory<T>& x_bn_data,
      const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
      const DeviceMemory<U>& scale_data, const DeviceMemory<U>& offset_data,
      const DeviceMemory<U>& saved_mean_data,
      const DeviceMemory<U>& saved_var_data,
      DeviceMemory<T>* x_bn_backprop_data, DeviceMemory<U>* scale_backprop_data,
      DeviceMemory<U>* offset_backprop_data,
      dnn::ProfileResult* output_profile_result);

  // DnnSupport override that selects an algorithm and allocates scratch
  // memory ahead of a convolution; see dnn.h for the interface contract.
  port::Status DoPrepareForConvolution(
      dnn::ConvolutionKind kind, dnn::DataType element_type, Stream* stream,
      const dnn::BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      DeviceMemoryBase filter_data,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemoryBase output_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::AlgorithmConfig& algorithm_config,
      ScratchAllocator* scratch_allocator, dnn::AlgorithmDesc* algorithm_desc,
      DeviceMemory<uint8>* scratch_memory) override;

  SE_DISALLOW_COPY_AND_ASSIGN(MIOpenSupport);
};

}  // namespace gpu
}  // namespace stream_executor

#endif  // TENSORFLOW_STREAM_EXECUTOR_ROCM_ROCM_DNN_H_