/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Neural Net operation support for StreamExecutor instances.
//
// This is an abstract interface for a platform to optionally support common
// neural net operations; it accommodates implementations such as the cudnn
// library operations.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_
#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_

#include <functional>
#include <limits>
#include <memory>
#include <tuple>

#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/dnn.pb.h"
#include "tensorflow/stream_executor/lib/array_slice.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace Eigen {
struct half;
}  // namespace Eigen

namespace stream_executor {

class HostBuffer;
class Stream;
class ScratchAllocator;

namespace dnn {

// Specifies an index to use when accessing specific spatial dimensions.
enum class DimIndex : int {
  X = 0,
  Y = 1,
  Z = 2,
};

// Helper functions to make methods more readable.
inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
  return data.rbegin()[static_cast<int64>(dim)];
}

inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
  data.rbegin()[static_cast<int64>(dim)] = value;
}

inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
  return SetDim(absl::MakeSpan(*data), dim, value);
}
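
// Illustrative example (not part of the API): DimIndex addresses dimensions
// from the minor end, so DimIndex::X is the last element of the span:
//
//   std::vector<int64> dims = {32, 3, 28, 28};               // assumed N, C, H, W
//   auto spatial = absl::MakeSpan(dims).subspan(2);          // {28, 28}
//   int64 h = GetDim(spatial, DimIndex::Y);                  // 28 (height)
//   SetDim(spatial, DimIndex::X, 14);                        // dims -> {32, 3, 28, 14}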

// tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
// open-source. Wrapper function that gives an int64 array slice view of a
// repeated int64 protobuf field.
inline absl::Span<const int64> AsInt64Slice(
    const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
  return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
                                 v.size());
}

inline absl::Span<int64> AsInt64Slice(
    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
  return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
                           v->size());
}

// Returns a string representation of the given data layout.
string DataLayoutString(DataLayout layout);

// Specifies a quantization for activations in a given BatchDescriptor.
enum class QuantizedActivationMode {
  k8Bit = 1,
  k16Bit = 2,
  k32Bit = 4,
};

// A helper class to convert C/C++ types to the proper enums.
template <typename T>
struct ToDataType;
template <>
struct ToDataType<float> {
  static constexpr DataType value = DataType::kFloat;
};
template <>
struct ToDataType<double> {
  static constexpr DataType value = DataType::kDouble;
};
template <>
struct ToDataType<Eigen::half> {
  static constexpr DataType value = DataType::kHalf;
};
template <>
struct ToDataType<int8> {
  static constexpr DataType value = DataType::kInt8;
};
template <>
struct ToDataType<int32> {
  static constexpr DataType value = DataType::kInt32;
};
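
// Illustrative use of ToDataType: map a C++ element type to its DataType enum
// at compile time, e.g. when forwarding to a type-erased entry point:
//
//   static_assert(ToDataType<float>::value == DataType::kFloat, "");
//   DataType dtype = ToDataType<Eigen::half>::value;   // DataType::kHalf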

// Specifies the types of a RNN model.
enum class RnnMode {
  kRnnRelu = 0,
  kRnnTanh = 1,
  kRnnLstm = 2,
  kRnnGru = 3,
};

// Specifies the input model and whether there is a linear transformation
// between the input state and the first layer hidden state.
enum class RnnInputMode {
  kRnnLinearSkip = 0,
  kRnnSkipInput = 1,
};

// Specifies the number of directions used in a RNN model. When bidirectional
// mode is used, the input states and output sequence contain data for both
// directions.
enum class RnnDirectionMode {
  kRnnUnidirectional = 0,
  kRnnBidirectional = 1,
};

// Relevant to DepthToSpace and SpaceToDepth. This is the write layout when
// performing depth to space and the read layout when performing space to
// depth. It's specified with most-major dimension first and most-minor
// dimension last. In DepthToSpace, the D*M² values are read in and then, for
// DepthHeightWidth, written out to the output patch, by varying first width,
// then height, then depth. In C array format, it looks like
// [depth][height][width]. See DepthToSpace comment for more information.
enum class DepthToSpaceLayout { DepthHeightWidth };

// Specifies the descriptor for a RNN model.
//
// An example use case:
//   * The user first creates a model through createRnnDescriptor.
//   * The user queries the size of the underlying opaque parameter buffer.
//   * The user creates and initializes a parameter buffer of the proper size.
//   * The user runs forward and backward operations using this RNN descriptor.
//   * Once in a while, the user queries the maintainable weight and bias
//     regions from the underlying parameter buffer. They are more likely to be
//     forward compatible and should be used when saving and restoring a model.
//   * The user releases the RNN descriptor when the model is no longer in use.
class RnnDescriptor {
 public:
  struct ParamsRegion {
    int64 offset;
    int64 size;
  };
  typedef std::vector<ParamsRegion> ParamsRegions;
  virtual ~RnnDescriptor() {}
  virtual int64 ParamsSizeInBytes() const { return -1; }
  virtual ParamsRegions ParamsWeightRegions() const { return ParamsRegions(); }
  virtual ParamsRegions ParamsBiasRegions() const { return ParamsRegions(); }
};
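
// Illustrative sketch of the workflow described above (assumes a DnnSupport
// implementation that provides createRnnDescriptor and a parameter buffer of
// the queried size; names here are for illustration only):
//
//   std::unique_ptr<RnnDescriptor> rnn = ...;        // from createRnnDescriptor
//   int64 params_bytes = rnn->ParamsSizeInBytes();   // size the opaque buffer
//   for (const RnnDescriptor::ParamsRegion& region : rnn->ParamsWeightRegions()) {
//     // region.offset / region.size locate one weight matrix inside the buffer.
//   }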

// Specifies the sequence in a RNN model.
//
// The user is responsible for releasing this descriptor when it is no longer
// in use. The destructor releases the underlying descriptors.
class RnnSequenceTensorDescriptor {
 public:
  virtual ~RnnSequenceTensorDescriptor() {}
};

// Specifies either the input or hidden state in a RNN model.
//
// The user is responsible for releasing this descriptor when it is no longer
// in use. The destructor releases the underlying descriptors.
class RnnStateTensorDescriptor {
 public:
  virtual ~RnnStateTensorDescriptor() {}
};

// Returns a string representation of the given quantization mode.
string QuantizedActivationModeString(QuantizedActivationMode mode);

// Describes the dimensions that a layer consumes/produces.
//
// This is a matrix (height, width), its "depth" (feature_map_count),
// how many of these matrices are present (count),
// and the maximum and minimum values expected in the matrix (value_max,
// value_min).
// If the input is quantized, all values greater than value_max will be clipped
// to value_max and all values less than value_min will be clipped to
// value_min.
// When quantized output is dequantized no value will be greater than
// value_max or less than value_min.
//
// Uses the named argument construction form:
//
//  auto input_batch_dimensions =
//      BatchDescriptor().set_count(42).set_feature_map_count(7)...
//
// Details:
//
// For a convolutional layer, a single inference takes a 3-dimensional matrix
// of input and produces a 3-dimensional matrix of output. We call the three
// dimensions height, width and feature_map_count, where for an image, the
// height and width correspond to the Y and X pixel indices, respectively, and
// the feature_map_count corresponds to the RGB dimension of the input data.
// Then the count indicates how many 3D matrices are being presented to be
// processed at once; this corresponds to the neural network concept of
// minibatch size.
//
// For a fully connected layer, it's better to put the nodes of the layer in
// the feature_map_count, and leave the height and width as degenerate (== 1).
// Count indicates how many input vectors (degenerate 3D matrices) are to be
// processed.
//
// If unspecified, value_max and value_min default to 0.0.
// If value_max == value_min the Stream will attempt to derive valid values -
// for example the output of Relu6 activation will always be in the range
// [0.0, 6.0].
//
// If unspecified, layout defaults to kYXDepthBatch.
class BatchDescriptor {
 public:
  // Creates a "blank" batch descriptor, which should be initialized via the
  // named argument helpers.
  BatchDescriptor();
  explicit BatchDescriptor(int ndims);

  // Clones values from 'other' for initialization.
  void CloneFrom(const BatchDescriptor& other);

  string ToString() const;
  string ToShortString() const;

  // Pre-condition:
  //   value_max_ == 0
  //   value_min_ == 0
  //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
  TensorDescriptorProto ToProto(DataType data_type) const;

  // Accessors.
  int64 count() const { return tensor_.dimensions(0); }
  int64 feature_map_count() const { return tensor_.dimensions(1); }
  int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
  int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
  int ndims() const { return spatial_size().size(); }
  float value_max() const { return value_max_; }
  float value_min() const { return value_min_; }
  DataLayout layout() const { return tensor_.data_layout(); }
  QuantizedActivationMode quantized_activation_mode() const {
    return quantized_activation_mode_;
  }
  // Full dimensions of the underlying data, ordered according to a specific
  // layout.
  std::vector<int64> full_dims(const DataLayout& layout) const;

  // Full strides of the underlying data, ordered according to a specific
  // layout.
  std::vector<int64> full_strides(const DataLayout& layout) const;

  // Named-argument helpers for avoiding user error during construction.
  BatchDescriptor& set_count(int64 value) {
    tensor_.set_dimensions(0, value);
    return *this;
  }
  BatchDescriptor& set_feature_map_count(int64 value) {
    tensor_.set_dimensions(1, value);
    return *this;
  }
  BatchDescriptor& set_height(int64 value) {
    SetDim(spatial_size(), DimIndex::Y, value);
    return *this;
  }
  BatchDescriptor& set_width(int64 value) {
    SetDim(spatial_size(), DimIndex::X, value);
    return *this;
  }
  BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
    SetDim(spatial_size(), dim, value);
    return *this;
  }
  BatchDescriptor& set_value_max(float value) {
    value_max_ = value;
    return *this;
  }
  BatchDescriptor& set_value_min(float value) {
    value_min_ = value;
    return *this;
  }
  BatchDescriptor& set_layout(DataLayout layout) {
    tensor_.set_data_layout(layout);
    return *this;
  }
  BatchDescriptor& set_quantized_activation_mode(
      QuantizedActivationMode quantized_activation_mode) {
    quantized_activation_mode_ = quantized_activation_mode;
    return *this;
  }

  // Return the number of nodes in a single feature map.
  int64 NodesPerFeatureMap() const;

  // Return the number of nodes across all feature maps. Note that this is not
  // affected by the batch count.
  int64 NodesAcrossFeatureMaps() const;

  // Returns the number of elements (e.g. RGB pixel values) required to hold a
  // given batch descriptor, given a no-padding assumption. Note that this is
  // affected by the batch count.
  int64 ElementCount() const;

  // Return the number of weights required to fully connect a layer with
  // dimensions given by the 'input' descriptor with a layer with dimensions
  // given by the 'output' descriptor.
  static int64 FullyConnectedWeightCount(const BatchDescriptor& input,
                                         const BatchDescriptor& output);

  // Return the number of biases required to fully connect to an output layer
  // with dimensions given by the 'output' descriptor.
  static int64 FullyConnectedBiasCount(const BatchDescriptor& output);

  // Return a BatchDescriptor for the output of a depth concatenation
  // with the given input descriptors. The inputs should have the same
  // dimensions, except possibly for feature_map_count(), though this
  // function does not verify that.
  static BatchDescriptor DepthConcatenateOutputDescriptor(
      port::ArraySlice<dnn::BatchDescriptor> inputs);

 private:
  absl::Span<const int64> spatial_size() const {
    return AsInt64Slice(tensor_.dimensions()).subspan(2);
  }

  absl::Span<int64> spatial_size() {
    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
  }

  TensorDescriptorProto tensor_;
  float value_max_;
  float value_min_;
  QuantizedActivationMode quantized_activation_mode_;
};
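
// Illustrative example: a minibatch of 32 RGB images of size 28x28, with the
// layout and quantization bounds left at their defaults (the concrete sizes
// here are assumptions, not defaults of the class):
//
//   BatchDescriptor input;
//   input.set_count(32)
//        .set_feature_map_count(3)
//        .set_height(28)
//        .set_width(28);
//   // input.ElementCount() == 32 * 3 * 28 * 28.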

// Returns a string representation of the given filter layout.
string FilterLayoutString(FilterLayout layout);

// Describes a filter for the convolution. This is the "window" from
// height-by-width patches of each of the feature maps in the input layer to
// the cells within the output feature map.
//
// Uses the named argument construction form:
//
//  FilterDescriptor filter_dimensions;
//  filter_dimensions
//      .set_output_feature_map_count(42)
//      .set_input_feature_map_count(7)
//      ...
//
// Arguments:
// - output_feature_map_count: number of feature maps in the output layer.
// - input_feature_map_count: number of feature maps in the input layer (from
//   which the filter patch is taken).
// - input_filter_height: "height" number of neurons used in the sliding window
//   over the input layer.
// - input_filter_width: "width" number of neurons used in the sliding window
//   over the input layer.
//
// Sometimes names like "filter input height" are referred to by synonymous
// terminology, such as "kernel y size".
//
// If unspecified, layout defaults to kOutputInputYX.
class FilterDescriptor {
 public:
  // By default construction, all dimensions are set to zero, so they should
  // all be populated by the user via the named-argument helpers below. (See
  // class comment for details.)
  FilterDescriptor();
  explicit FilterDescriptor(int ndims);
  ~FilterDescriptor();

  // Named-argument helpers for avoiding user error during construction.
  FilterDescriptor& set_output_feature_map_count(int64 value) {
    tensor_.set_dimensions(0, value);
    return *this;
  }
  FilterDescriptor& set_input_feature_map_count(int64 value) {
    tensor_.set_dimensions(1, value);
    return *this;
  }
  FilterDescriptor& set_input_filter_height(int64 value) {
    SetDim(input_filter_dims(), DimIndex::Y, value);
    return *this;
  }
  FilterDescriptor& set_input_filter_width(int64 value) {
    SetDim(input_filter_dims(), DimIndex::X, value);
    return *this;
  }
  FilterDescriptor& set_layout(FilterLayout layout) {
    tensor_.set_filter_layout(layout);
    return *this;
  }
  FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
    SetDim(input_filter_dims(), dim, value);
    return *this;
  }
  int ndims() const { return input_filter_dims().size(); }

  void CloneFrom(const FilterDescriptor& other);

  string ToString() const;
  string ToShortString() const;
  TensorDescriptorProto ToProto(DataType data_type) const;

  // Returns the number of weights required as parameters for a convolution
  // using this filter descriptor.
  int64 ComputeWeightCount() const;

  // Returns the number of biases required as parameters for a convolution
  // using this filter descriptor.
  int64 bias_count() const { return output_feature_map_count(); }

  int64 output_feature_map_count() const { return tensor_.dimensions(0); }
  int64 input_feature_map_count() const { return tensor_.dimensions(1); }
  int64 input_filter_height() const {
    return GetDim(input_filter_dims(), DimIndex::Y);
  }
  int64 input_filter_width() const {
    return GetDim(input_filter_dims(), DimIndex::X);
  }
  int64 input_filter_dim(DimIndex dim) const {
    return GetDim(input_filter_dims(), dim);
  }

  FilterLayout layout() const { return tensor_.filter_layout(); }

  absl::Span<const int64> input_filter_dims() const {
    return AsInt64Slice(tensor_.dimensions()).subspan(2);
  }

 private:
  absl::Span<int64> input_filter_dims() {
    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
  }

  TensorDescriptorProto tensor_;
};
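
// Illustrative example: 42 output feature maps computed from 3 input feature
// maps with a 5x5 sliding window, layout left at its default (kOutputInputYX):
//
//   FilterDescriptor filter;
//   filter.set_output_feature_map_count(42)
//         .set_input_feature_map_count(3)
//         .set_input_filter_height(5)
//         .set_input_filter_width(5);
//   // filter.ComputeWeightCount() == 42 * 3 * 5 * 5; filter.bias_count() == 42.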

// Describes how padding should be aligned when the total number of pad
// elements is odd.
enum class PadAlignment : int64 {
  kDefault = 0,        // default padding for the device.
  kCudnnPadding,       // cuDNN padding - prefer to pad at the start.
  kTensorFlowPadding,  // TensorFlow padding - prefer to pad at the end.
};

// Returns a string representation of the given padding alignment.
string PadAlignmentString(PadAlignment alignment);

// Print alignment to str. Needed to use CHECK_EQ between two PadAlignments.
std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);

// Describes a convolution.
//
// Uses the named argument construction form:
//
//  ConvolutionDescriptor convolution_dimensions;
//  convolution_dimensions
//      .set_vertical_filter_stride(2)
//      .set_horizontal_filter_stride(2)
//      ...
//
// Arguments:
// - zero_padding_height: padding of the "y dimension" of the input data. Note
//   that this is different from the height of the filter.
// - zero_padding_width: analogous to the height above, but in the "x
//   dimension".
// - vertical_filter_stride: the convolution slides a 2-dimensional window of
//   filter-height-by-filter-width over the input layer -- the center of that
//   window is moved in the "y dimension" according to this stride value.
// - horizontal_filter_stride: analogous to the vertical stride above, but in
//   the "x dimension".
// - vertical_dilation_rate: there will be (vertical_dilation_rate - 1) skipped
//   cells between each filter element in the "y dimension".
// - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
//   skipped cells between each filter element in the "x dimension".
// - convolution_not_crosscorr: By default (convolution_not_crosscorr == false),
//   we perform cross correlation rather than convolution. With the flag set,
//   we perform convolution. Convolution and cross correlation are related by
//   rotating the filter by 180 degrees (or equivalently flipping all spatial
//   dimensions).
class ConvolutionDescriptor {
 public:
  // By default construction, there is no zero-padding and the filter stride is
  // 1x1 (centering the filter on every cell in the input layer's
  // width-by-height area).
  ConvolutionDescriptor();
  explicit ConvolutionDescriptor(int ndims);
  ~ConvolutionDescriptor();

  string ToString() const;
  string ToShortString() const;
  ConvolutionDescriptorProto ToProto() const { return proto_; }

  ConvolutionDescriptor& set_zero_padding_height(int64 value) {
    SetDim(padding(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_zero_padding_width(int64 value) {
    SetDim(padding(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
    SetDim(padding(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
    SetDim(strides(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
    SetDim(strides(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
    SetDim(strides(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
    SetDim(dilations(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
    SetDim(dilations(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
    SetDim(dilations(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_group_count(int group_count) {
    proto_.set_group_count(group_count);
    return *this;
  }
  ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
    proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
                                     : ConvolutionMode::CROSS_CORRELATION);
    return *this;
  }
  int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
  int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
  int64 vertical_filter_stride() const {
    return GetDim(strides(), DimIndex::Y);
  }
  int64 horizontal_filter_stride() const {
    return GetDim(strides(), DimIndex::X);
  }
  int64 vertical_dilation_rate() const {
    return GetDim(dilations(), DimIndex::Y);
  }
  int64 horizontal_dilation_rate() const {
    return GetDim(dilations(), DimIndex::X);
  }

  int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
  int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
  int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
  // TODO(timshen): remove this function. No users of this class are setting a
  // non-default pad alignment.
  PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
  int group_count() const { return proto_.group_count(); }
  int ndims() const { return padding().size(); }
  bool convolution_not_crosscorr() const {
    return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
  }

  absl::Span<const int64> strides() const {
    return AsInt64Slice(proto_.strides());
  }

  absl::Span<const int64> dilations() const {
    return AsInt64Slice(proto_.dilations());
  }

  absl::Span<const int64> padding() const {
    return AsInt64Slice(proto_.paddings());
  }

 private:
  absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }

  absl::Span<int64> dilations() {
    return AsInt64Slice(proto_.mutable_dilations());
  }

  absl::Span<int64> padding() {
    return AsInt64Slice(proto_.mutable_paddings());
  }

  ConvolutionDescriptorProto proto_;

  // TODO(leary) cudnn provides these fields, but need to characterize what
  // their effect is -- they may be boolean rather than integral.
  // int64 upscale_input_x;
  // int64 upscale_input_y;
};
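
// Illustrative example: a 2-D convolution with stride 2 in both dimensions and
// one cell of zero padding on each spatial edge; the dilation rates are left
// at their defaults:
//
//   ConvolutionDescriptor conv;
//   conv.set_zero_padding_height(1)
//       .set_zero_padding_width(1)
//       .set_vertical_filter_stride(2)
//       .set_horizontal_filter_stride(2);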

// A patch of values in the input can be pooled via either a max or an average
// operation.
// Specify int64 so there's no padding in PoolingDescriptor.
enum class PoolingMode : int64 {
  kMaximum,
  kAverage,
};

// Specify the dimension in which to concatenate inputs in space.
// Specify int64 so there's no padding in SpaceConcatenateMode.
enum class SpaceConcatenateMode : int64 {
  XDirection,
  YDirection,
};

// Returns a short name for the pooling mode, e.g. "Avg".
string ShortPoolingModeString(PoolingMode mode);

// Describes a pooling operation to be enqueued onto a stream via a platform's
// DnnSupport.
//
// TODO(broune): describe how padding works and what happens if the
// window height/width is not divisible by the vertical/horizontal
// stride.
//
// Arguments:
//  pooling_mode: pooling operator to use on the input patch
//  window_height: height of input window
//  window_width: width of input window
//  vertical_stride: vertical delta for center of the input patch
//  horizontal_stride: horizontal delta for center of the input patch
class PoolingDescriptor {
 public:
  PoolingDescriptor();
  explicit PoolingDescriptor(int ndims);

  PoolingDescriptor& set_pooling_mode(PoolingMode value) {
    mode_ = value;
    return *this;
  }
  PoolingDescriptor& set_window_height(int64 value) {
    SetDim(&window_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_window_width(int64 value) {
    SetDim(&window_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_window(DimIndex dim, int64 value) {
    SetDim(&window_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_vertical_padding(int64 value) {
    SetDim(&padding_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_horizontal_padding(int64 value) {
    SetDim(&padding_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_padding(DimIndex dim, int64 value) {
    SetDim(&padding_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_vertical_stride(int64 value) {
    SetDim(&strides_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_horizontal_stride(int64 value) {
    SetDim(&strides_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_stride(DimIndex dim, int64 value) {
    SetDim(&strides_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_propagate_nans(bool value) {
    propagate_nans_ = value;
    return *this;
  }

  int ndims() const { return ndims_; }
  void CloneFrom(const PoolingDescriptor& other);

  string ToString() const;
  string ToShortString() const;

  PoolingMode mode() const { return mode_; }
  int64 window_height() const { return GetDim(window_, DimIndex::Y); }
  int64 window_width() const { return GetDim(window_, DimIndex::X); }
  int64 window(DimIndex dim) const { return GetDim(window_, dim); }
  int64 vertical_padding() const { return GetDim(padding_, DimIndex::Y); }
  int64 horizontal_padding() const { return GetDim(padding_, DimIndex::X); }
  int64 padding(DimIndex dim) const { return GetDim(padding_, dim); }
  int64 vertical_stride() const { return GetDim(strides_, DimIndex::Y); }
  int64 horizontal_stride() const { return GetDim(strides_, DimIndex::X); }
  int64 stride(DimIndex dim) const { return GetDim(strides_, dim); }
  absl::Span<const int64> window() const { return window_; }
  absl::Span<const int64> padding() const { return padding_; }
  absl::Span<const int64> strides() const { return strides_; }
  bool propagate_nans() const { return propagate_nans_; }

 private:
  PoolingMode mode_;
  int ndims_;
  bool propagate_nans_;

  // Stored as: ..., y, x.
  std::vector<int64> window_;
  std::vector<int64> padding_;
  std::vector<int64> strides_;
};
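
// Illustrative example: 2x2 max pooling with stride 2 and padding left at its
// default, i.e. each output cell is the maximum over a disjoint 2x2 input
// patch:
//
//   PoolingDescriptor pool;
//   pool.set_pooling_mode(PoolingMode::kMaximum)
//       .set_window_height(2)
//       .set_window_width(2)
//       .set_vertical_stride(2)
//       .set_horizontal_stride(2);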

// Collects parameters for DNN algorithms.
class AlgorithmDesc {
 public:
  typedef int64 Index;
  AlgorithmDesc() : AlgorithmDesc(0, false) {}
  AlgorithmDesc(Index a, bool use_tensor_ops) {
    proto_.set_algo_id(a);
    proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
                                        : AlgorithmProto::DEFAULT_MATH);
  }
  bool tensor_ops_enabled() const {
    return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
  }
  Index algo_id() const { return proto_.algo_id(); }
  bool operator==(const AlgorithmDesc& other) const {
    return algo_id() == other.algo_id() &&
           tensor_ops_enabled() == other.tensor_ops_enabled();
  }
  uint64 hash() const;

  AlgorithmProto ToProto() const { return proto_; }

 private:
  AlgorithmProto proto_;
};

// Describes the result from a perf experiment.
//
// Arguments:
//  algorithm: returns the exact algorithm that was used.
//  elapsed_time_in_ms: returns the measured elapsed time in milliseconds.
class ProfileResult {
 public:
  bool is_valid() const {
    return algorithm_.has_value() &&
           elapsed_time_in_ms() != std::numeric_limits<float>::max();
  }

  AlgorithmDesc algorithm() const { return *algorithm_; }
  void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }

  float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
  void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }

  size_t scratch_size() const { return scratch_size_; }
  void set_scratch_size(size_t val) { scratch_size_ = val; }

 private:
  absl::optional<AlgorithmDesc> algorithm_;
  float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
  // The scratch size algorithm_ requires. Currently it's only populated by
  // convolutions.
  size_t scratch_size_ = 0;
};

// Describes the configuration for the algorithms that will be used.
//
// Arguments:
//  algorithm: the primary algorithm that should be used.
//  algorithm_no_scratch: a secondary algorithm that should be used, if the
//    allocation for the scratch memory fails.
class AlgorithmConfig {
 public:
  AlgorithmConfig() {}
  explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {}
  AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch)
      : algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {}
  absl::optional<AlgorithmDesc> algorithm() const { return algorithm_; }
  void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
  absl::optional<AlgorithmDesc> algorithm_no_scratch() const {
    return algorithm_no_scratch_;
  }
  void set_algorithm_no_scratch(AlgorithmDesc val) {
    algorithm_no_scratch_ = val;
  }
  bool operator==(const AlgorithmConfig& other) const {
    return this->algorithm_ == other.algorithm_ &&
           this->algorithm_no_scratch_ == other.algorithm_no_scratch_;
  }
  bool operator!=(const AlgorithmConfig& other) const {
    return !(*this == other);
  }
  string ToString() const;

 private:
  absl::optional<AlgorithmDesc> algorithm_;
  absl::optional<AlgorithmDesc> algorithm_no_scratch_;
};
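
// Illustrative example: prefer one algorithm but fall back to another when the
// scratch allocation fails (the algorithm ids below are placeholders, not real
// cuDNN algorithm ids):
//
//   AlgorithmDesc fast(/*a=*/1, /*use_tensor_ops=*/true);
//   AlgorithmDesc no_scratch(/*a=*/0, /*use_tensor_ops=*/false);
//   AlgorithmConfig config(fast, no_scratch);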

// Describes a local response normalization (LRN). LRN is used e.g. in
// dist_belief.
//
// Let V be the vector of feature maps at some (batch, y, x)
// coordinate. LRN applies independently to each vector V in the
// input, across all coordinates (batch, y, x), by mapping each V to
// another vector U of the same size using the formula
//
//   U_i = V_i / ((bias + alpha * (sum_j V_j^2)) ^ beta)
//
// where the sum is taken over j in the closed range [i - range, i + range].
//
// When calculating U_i the j in the sum can extend beyond the bounds
// of V. If wrap_around is true, then V_j = V_{j mod F} where F is the
// size of V, which is the number of feature maps. If wrap_around is
// false, then V_j = 0 for j outside [0, F-1].
//
// If segment_size <= F, where F is the number of feature_maps, then
// segment_size has no effect. Otherwise, each consecutive segment of
// segment_size entries in V is normalized separately.
//
// Not all StreamExecutors allow wrap_around == true or segment_size
// != 64. Some do not implement normalization at all.
class NormalizeDescriptor {
 public:
  NormalizeDescriptor();

  NormalizeDescriptor& set_bias(float bias) {
    bias_ = bias;
    return *this;
  }

  NormalizeDescriptor& set_range(int32 range) {
    range_ = range;
    return *this;
  }

  NormalizeDescriptor& set_alpha(float alpha) {
    alpha_ = alpha;
    return *this;
  }

  NormalizeDescriptor& set_beta(float beta) {
    beta_ = beta;
    return *this;
  }

  NormalizeDescriptor& set_wrap_around(bool wrap_around) {
    wrap_around_ = wrap_around;
    return *this;
  }

  NormalizeDescriptor& set_segment_size(int32 segment_size) {
    segment_size_ = segment_size;
    return *this;
  }

  void CloneFrom(const NormalizeDescriptor& other);

  string ToString() const;
  string ToShortString() const;

  float bias() const { return bias_; }
  int32 range() const { return range_; }
  float alpha() const { return alpha_; }
  float beta() const { return beta_; }
  bool wrap_around() const { return wrap_around_; }
  int32 segment_size() const { return segment_size_; }

 private:
  float bias_;
  int32 range_;
  float alpha_;
  float beta_;
  bool wrap_around_;
  int32 segment_size_;
};
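
// Illustrative example, matching the formula above: with range = 2 each U_i is
// normalized over the 2 * range + 1 = 5 nearest feature maps, giving
// U_i = V_i / ((1.0 + 0.1 * sum_j V_j^2) ^ 0.5):
//
//   NormalizeDescriptor lrn;
//   lrn.set_bias(1.0f).set_range(2).set_alpha(0.1f).set_beta(0.5f);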

// Returns a string representation of the given activation mode.
string ActivationModeString(ActivationMode mode);

// Describes the operation that DoElementwiseOperation should perform on its
// inputs.
enum class ElementwiseOperation { kAdd, kMultiply };

string ElementwiseOperationString(ElementwiseOperation op);

// A simple class representing the version of the backing library, to
// work around the "too perfect forwarding" issue in gcc6+ compilers.
// See PR#16309 and issue #18402 for links discussing the issue.
class VersionInfo {
 public:
  VersionInfo(int major = 0, int minor = 0, int patch = 0)
      : major_(major), minor_(minor), patch_(patch) {}
  int major_version() const { return major_; }
  int minor_version() const { return minor_; }
  int patch() const { return patch_; }

 private:
  int major_;
  int minor_;
  int patch_;
};

// Suite of operations typically used for implementing Deep/Convolutional
// Neural Nets. Note: A false return value of an operation indicates the
// implementation is not available.
//
// TODO(b/118763918): this class (or rather dispatch table) has several
// problems:
// * Some overloads are missing. Ideally we want to have template virtual
//   functions while the template arguments form a closed set. However, we
//   don't get that from the language.
// * The API is a union of cuDNN and another private backend. Only 10% of the
//   functions are implemented by both backends; the rest are backend-specific.
//   The massive interface creates extra mental burden.
// * Poor error handling: the API should return Status objects.
//
// PrepareForConvolution is an example for how new APIs should be written.
class DnnSupport {
 public:
  DnnSupport() {}
  virtual ~DnnSupport() {}

  virtual port::Status Init() = 0;

  // Gets the version of the backing library, as a VersionInfo object.
  virtual port::StatusOr<VersionInfo> GetVersion() {
    return port::UnimplementedError(
        "DnnSupport::GetVersion not implemented on this platform.");
  }

  // Performs a single-precision forward batch normalization operation onto
  // the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the batch normalization
  //    operation should be enqueued onto.
  //  x: input data.
  //  scale: scaling parameters.
  //  offset: offset parameters.
  //  estimated_mean: population mean estimated during training.
  //    Used for inference only; empty for training.
  //  estimated_variance: population variance estimated during training,
  //    used for inference only; empty for training.
  //  x_desc: dimensions of the input data, which is the same as the dimensions
  //    of the output.
  //  scale_offset_desc: dimensions of scale and offset.
  //  epsilon: a small floating point number added to the variance of x.
  //  y: output data.
  //  batch_mean: batch mean, to be used to compute the running mean.
  //  batch_variance: batch variance, to be used to compute
  //    the running variance.
  //  reserve_space_1: saved mean, to be reused in the backward gradient
  //    computation.
  //  reserve_space_2: saved inv_var (1/sqrt(epsilon + variance)), to be reused
  //    in the backward gradient computation.
  //  is_training: Set to true for training, false for inference.
  //  var_to_inv_var: a function to convert the variance to inverted variance
  //    for cuDNN v4 forward inference.
  //  inv_var_to_var: a function to convert the inverted variance to
  //    variance for cuDNN v4 forward training, to be used for TensorFlow
  //    to calculate the running variance.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<float>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
      DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      std::function<const DeviceMemory<float>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var) {
    return false;
  }

  // Performs a half-precision forward batch normalization operation onto the
  // stream. See DoBatchNormalizationForward above for argument details.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<Eigen::half>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
      DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      std::function<const DeviceMemory<float>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var) {
    return false;
  }

  // Performs a single-precision backward batch normalization gradient
  // computation operation onto the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the batch normalization
  //    gradient computation operation should be enqueued onto.
  //  y_backprop: gradient with regard to output y.
  //  x: input data.
  //  scale: scaling parameters.
  //  inv_var: 1/sqrt(epsilon + variance) of x.
  //  x_desc: dimensions of the input data, which is the same as the dimensions
  //    of the output.
  //  scale_offset_desc: dimensions of scale and offset.
  //  epsilon: a small floating point number added to the variance of x.
  //  x_backprop: gradient with respect to input x.
  //  scale_backprop: gradient with respect to scale.
  //  offset_backprop: gradient with respect to offset.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<float>& y_backprop,
      const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop) {
    return false;
  }

  // Performs a half-precision backward batch normalization gradient
  // computation operation onto the stream. See DoBatchNormalizationBackward
  // above for argument details.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
      const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* x_backprop,
      DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop) {
    return false;
  }

  // Enqueues a fused convolution operation onto the stream.
  // We provide several variants with different types for inputs, biases and
  // scaling parameters.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'convolve' operation
  //    should be enqueued onto.
  //  conv_input_descriptor: dimensions of the convolution input layer.
  //  conv_input_data: un-owned device memory region which contains the
  //    convolution input.
  //  conv_input_scale: a floating point scale to multiply with each element
  //    of conv_input_data.
  //  filter_descriptor: dimensions of the convolution filter.
  //  filter_data: un-owned device memory region which contains the
  //    convolution filter weights.
  //  convolution_descriptor: stride of the convolution filter.
  //  biases: un-owned device memory region containing biases to add to the
  //    input.
  //  activation_mode: Type of activation to perform.
  //  side_input_data: un-owned device memory region which contains optional
  //    side input data. If 'side_input_scale' is non-zero, then this must
  //    point to data in the tensor shape specified by output_shape.
  //    It will be scaled by 'side_input_scale' and added to the convolution
  //    result and bias prior to applying the activation function.
  //  side_input_scale: a floating point scale to multiply with each element
  //    of side_input_data.
  //  output_descriptor: dimensions of the output layer.
  //  output_data: un-owned device memory region in which to place the
  //    convolution result.
  //  scratch_allocator: un-owned, may-be-null object that may allocate scratch
  //    space in order to speed up the convolution operation.
  //  algorithm_config: specifies which algorithm should be used for the
  //    operation.
  //  output_profile_result: the output profile result for this call. The
  //    profiling is only enabled when this is not nullptr.
  //
  // conv_input_descriptor, filter_descriptor, convolution_descriptor and
  // output_descriptor together specify exactly how the convolution is aligned
  // with the input data:
  //
  // * (input dimensions - filter size + 1) / filter stride == output dimensions
  //   corresponds to dist_belief padding = VALID, i.e. the input is not padded.
  // * input dimensions / filter stride == output dimensions
  //   corresponds to dist_belief padding = SAME, i.e. input and output are the
  //   same size - this requires padding the input.
  // * (input dimensions + filter size - 1) / filter stride == output dimensions
  //   corresponds to dist_belief padding = FULL, i.e. the output is sized so
  //   that if the inverse of the filter is applied to the output in VALID mode
  //   the result is the same size as the input - this requires even more
  //   padding of the input.
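  //
  // As a worked instance of the relationships above: with an input dimension
  // of 5, a filter size of 3 and a filter stride of 1, VALID padding yields
  // (5 - 3 + 1) / 1 = 3 output cells, SAME padding yields 5 / 1 = 5, and FULL
  // padding yields (5 + 3 - 1) / 1 = 7.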
1115 virtual bool DoFusedConvolve(
1116 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1117 const DeviceMemory<double>& conv_input_data, double conv_input_scale,
1118 const dnn::FilterDescriptor& filter_descriptor,
1119 const DeviceMemory<double>& filter_data,
1120 const dnn::ConvolutionDescriptor& convolution_descriptor,
1121 const DeviceMemory<double>& side_input_data, double side_input_scale,
1122 const dnn::BatchDescriptor& bias_descriptor,
1123 const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
1124 const dnn::BatchDescriptor& output_descriptor,
1125 DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
1126 const dnn::AlgorithmConfig& algorithm_config,
1127 dnn::ProfileResult* output_profile_result) {
1128 return false;
1129 }
1130
1131 // This is the float version of DoFusedConvolve.
1132 virtual bool DoFusedConvolve(
1133 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1134 const DeviceMemory<float>& conv_input_data, float conv_input_scale,
1135 const dnn::FilterDescriptor& filter_descriptor,
1136 const DeviceMemory<float>& filter_data,
1137 const dnn::ConvolutionDescriptor& convolution_descriptor,
1138 const DeviceMemory<float>& side_input_data, float side_input_scale,
1139 const dnn::BatchDescriptor& bias_descriptor,
1140 const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
1141 const dnn::BatchDescriptor& output_descriptor,
1142 DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
1143 const dnn::AlgorithmConfig& algorithm_config,
1144 dnn::ProfileResult* output_profile_result) {
1145 return false;
1146 }
1147
1148 // This is the Eigen::half version of DoFusedConvolve.
1149 // The scaling parameters are still floats.
1150 virtual bool DoFusedConvolve(
1151 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1152 const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
1153 const dnn::FilterDescriptor& filter_descriptor,
1154 const DeviceMemory<Eigen::half>& filter_data,
1155 const dnn::ConvolutionDescriptor& convolution_descriptor,
1156 const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
1157 const dnn::BatchDescriptor& bias_descriptor,
1158 const DeviceMemory<Eigen::half>& biases,
1159 dnn::ActivationMode activation_mode,
1160 const dnn::BatchDescriptor& output_descriptor,
1161 DeviceMemory<Eigen::half>* output_data,
1162 ScratchAllocator* scratch_allocator,
1163 const dnn::AlgorithmConfig& algorithm_config,
1164 dnn::ProfileResult* output_profile_result) {
1165 return false;
1166 }
1167
1168 // This is the int8 version of DoFusedConvolve.
1169 // The bias input and scaling parameters are floats.
1170 virtual bool DoFusedConvolve(
1171 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
1172 const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
1173 const dnn::FilterDescriptor& filter_descriptor,
1174 const DeviceMemory<int8>& filter_data,
1175 const dnn::ConvolutionDescriptor& convolution_descriptor,
1176 const DeviceMemory<int8>& side_input_data, float side_input_scale,
1177 const dnn::BatchDescriptor& bias_descriptor,
1178 const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
1179 const dnn::BatchDescriptor& output_descriptor,
1180 DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
1181 const dnn::AlgorithmConfig& algorithm_config,
1182 dnn::ProfileResult* output_profile_result) {
1183 return false;
1184 }
1185
1186 template <typename ElementType>
1187 port::Status PrepareForConvolution(
1188 ConvolutionKind kind, Stream* stream,
1189 const BatchDescriptor& batch_descriptor,
1190 DeviceMemory<ElementType> input_data,
1191 const FilterDescriptor& filter_descriptor,
1192 DeviceMemory<ElementType> filter_data,
1193 const BatchDescriptor& output_descriptor,
1194 DeviceMemory<ElementType> output_data,
1195 const ConvolutionDescriptor& convolution_descriptor,
1196 const AlgorithmConfig& algorithm_config,
1197 ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
1198 DeviceMemory<uint8>* scratch_memory) {
1199 return DoPrepareForConvolution(
1200 kind, ToDataType<ElementType>::value, stream, batch_descriptor,
1201 input_data, filter_descriptor, filter_data, output_descriptor,
1202 output_data, convolution_descriptor, algorithm_config,
1203 scratch_allocator, algorithm_desc, scratch_memory);
1204 }
1205
1206 // Enqueues a single-precision convolution operation onto the stream.
1207 //
1208 // Arguments (all borrowed):
1209 // stream: borrowed pointer to the stream that the 'convolve' operation
1210 // should be enqueued onto.
1211 // input_descriptor: dimensions of the input layer.
1212 // input_data: un-owned device memory region which contains the
1213 // convolution input.
1214 // filter_descriptor: dimensions of the convolution filter.
1215 // convolution_descriptor: stride of the convolution filter.
1216 // output_descriptor: dimensions of the output layer.
1217 // output_data: un-owned device memory region in which to place the
1218 // convolution result.
1219 // algorithm_desc: specifies which algorithm should be used for the
1220 // operation.
1221 // scratch: un-owned device memory for scratch space in order to speed up
1222 // the convolution operation.
1223 // output_profile_result: the output profile result for this call. The
1224 // profiling is only enabled when this is not nullptr.
1225 //
1226 // input_descriptor, filter_descriptor, convolution_descriptor and
1227 // output_descriptor together specify exactly how the convolution is aligned
1228 // with the input data:
1229 //
1230 // * (input dimensions - filter size + 1) / filter stride == output dimensions
1231 // corresponds to dist_belief padding = VALID, i.e. the input is not padded.
1232 // * input dimensions / filter stride == output dimensions
1233 // corresponds to dist_belief padding = SAME, i.e. input and output are the
1234 // same size - this requires padding the input.
1235 // * (input dimensions + filter size - 1) / filter stride == output dimensions
1236 // corresponds to dist_belief padding = FULL, i.e. the output is sized so
1237 // that if the inverse of the filter is applied to the output in VALID mode
1238 // the result is the same size as the input - this requires even more
1239 // padding of the input.
1240 virtual port::Status DoConvolve(
1241 ConvolutionKind kind, DataType element_type, Stream* stream,
1242 const BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
1243 const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
1244 const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
1245 const ConvolutionDescriptor& convolution_descriptor,
1246 AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
1247 ProfileResult* output_profile_result) = 0;
1248
1249 template <typename ElementType>
1250 bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1251 const DeviceMemory<ElementType>& input_data,
1252 const dnn::FilterDescriptor& filter_descriptor,
1253 const DeviceMemory<ElementType>& filter_data,
1254 const dnn::ConvolutionDescriptor& convolution_descriptor,
1255 const dnn::BatchDescriptor& output_descriptor,
1256 DeviceMemory<ElementType>* output_data,
1257 const dnn::AlgorithmDesc& algorithm_desc,
1258 DeviceMemory<uint8>* scratch_memory,
1259 ProfileResult* output_profile_result) {
1260 return IsStatusOk(
1261 DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
1262 stream, input_descriptor, input_data, filter_descriptor,
1263 filter_data, output_descriptor, *output_data,
1264 convolution_descriptor, algorithm_desc, *scratch_memory,
1265 output_profile_result),
1266 !output_profile_result);
1267 }
1268
1269 // Return a list of algorithms supported by the forward convolution pass.
1270 // cc_major and cc_minor are the compute capabilities of the device.
1271 virtual bool GetConvolveAlgorithms(
1272 bool with_winograd_nonfused, int cc_major, int cc_minor,
1273 std::vector<AlgorithmDesc>* out_algorithms);
1274
1275 // Returns a list of supported rnn algorithms.
1276 virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);
1277
1278 // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
1279 // coefficient_scales specifies the scaling of each column of coefficients:
1280 // original float coefficient[row * num_columns + column] =
1281 // quantized coefficient[row * num_columns + column] *
1282 // coefficient_scales[column].
1283 virtual bool DoConvolveQuantized(
1284 Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1285 const DeviceMemory<float>& input_data,
1286 const dnn::FilterDescriptor& filter_descriptor,
1287 const DeviceMemory<int8>& filter_coefficients,
1288 const DeviceMemory<float>& coefficient_scales,
1289 const dnn::ConvolutionDescriptor& convolution_descriptor,
1290 const dnn::BatchDescriptor& output_descriptor,
1291 DeviceMemory<float>* output_data) = 0;
1292
1293 // Same as DoConvolveQuantized above, but int8 filter coefficients.
1294 virtual bool DoConvolveQuantized(
1295 Stream* stream, const dnn::BatchDescriptor& input_descriptor,
1296 const DeviceMemory<float>& input_data,
1297 const dnn::FilterDescriptor& filter_descriptor,
1298 const DeviceMemory<int16>& filter_coefficients,
1299 const DeviceMemory<float>& coefficient_scales,
1300 const dnn::ConvolutionDescriptor& convolution_descriptor,
1301 const dnn::BatchDescriptor& output_descriptor,
1302 DeviceMemory<float>* output_data) = 0;
1303
1304 // Variation of the above with the weight matrix split into two matrices.
1305 // first_weights: Coefficients of the first matrix.
1306 // second_weights: Coefficients of the second matrix.
1307 // depth_multiplier: specifies the columns of the first matrix and rows
1308 // of the second one - first_weights columns = depth_multiplier,
1309 // second_weights rows = depth_multiplier *
1310 // filter_descriptor.input_feature_map_count().
1311 // see go/separable for documentation on separable convolutions.
1312 virtual bool DoSeparableConvolve(
1313 Stream* stream, const BatchDescriptor& input_descriptor,
1314 const DeviceMemory<float>& input_data,
1315 const FilterDescriptor& filter_descriptor, int depth_multiplier,
1316 const DeviceMemory<float>& first_weights,
1317 const DeviceMemory<float>& second_weights,
1318 const ConvolutionDescriptor& convolution_descriptor,
1319 const BatchDescriptor& output_descriptor,
1320 DeviceMemory<float>* output_data) = 0;
1321
1322 // Enqueues a single-precision backward convolution (for data) operation onto
1323 // the stream.
1324 //
1325 // Arguments:
1326 // stream: borrowed pointer to the stream that the 'convolve' operation
1327 // should be enqueued onto.
1328 // filter_descriptor: dimensions of the convolution filter.
1329 // filter_data: coefficients for the convolution filter.
1330 // output_descriptor: dimensions of the output gradients, which is the same
1331 // as the dimensions of the output.
1332 // backward_output_data: un-owned device memory region which contains the
1333 // backprop of the output.
1334 // convolution_descriptor: stride of the convolution filter.
1335 // input_descriptor: dimensions of the input layer.
1336 // backward_input_data: un-owned device memory region in which to place the
1337 // backprop of the input.
1338 // scratch_allocator: un-owned, may-be-null object that may allocate scratch
1339 // space in order to speed up the convolution operation.
1340 template <typename ElementType>
1341 bool DoConvolveBackwardData(
1342 Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
1343 const DeviceMemory<ElementType>& filter_data,
1344 const dnn::BatchDescriptor& output_descriptor,
1345 const DeviceMemory<ElementType>& backward_output_data,
1346 const dnn::ConvolutionDescriptor& convolution_descriptor,
1347 const dnn::BatchDescriptor& input_descriptor,
1348 DeviceMemory<ElementType>* backward_input_data,
1349 const dnn::AlgorithmDesc& algorithm_desc,
1350 DeviceMemory<uint8>* scratch_memory,
1351 ProfileResult* output_profile_result) {
1352 return IsStatusOk(
1353 DoConvolve(ConvolutionKind::BACKWARD_DATA,
1354 ToDataType<ElementType>::value, stream, input_descriptor,
1355 *backward_input_data, filter_descriptor, filter_data,
1356 output_descriptor, backward_output_data,
1357 convolution_descriptor, algorithm_desc, *scratch_memory,
1358 output_profile_result),
1359 !output_profile_result);
1360 }
1361
1362 // Return a list of algorithms supported by the backward convolution pass for
1363 // data.
1364 virtual bool GetConvolveBackwardDataAlgorithms(
1365 bool with_winograd_nonfused, int cc_major, int cc_minor,
1366 std::vector<AlgorithmDesc>* out_algorithms);
1367
  // Enqueues a backward convolution (for filter) operation onto the stream.
1370 //
1371 // Arguments:
1372 // stream: borrowed pointer to the stream that the 'convolve' operation
1373 // should be enqueued onto.
1374 // input_descriptor: dimensions of the input layer.
1375 // input_data: un-owned device memory region which contains the
1376 // convolution input.
  //  output_descriptor: dimensions of the output gradients, which are the same
  //    as the dimensions of the output.
1379 // backward_output_data: un-owned device memory region which contains the
1380 // backprop of the output.
1381 // convolution_descriptor: stride of the convolution filter.
1382 // filter_descriptor: dimensions of the convolution filter.
1383 // backward_filter_data: un-owned device memory region in which to place the
1384 // backprop of the filter.
  //  algorithm_desc: the convolution algorithm to use.
  //  scratch_memory: un-owned device memory used as scratch space in order to
  //    speed up the convolution operation.
  //  output_profile_result: the output profile result for this call. The
  //    profiling is only enabled when this is not nullptr.
1387 template <typename ElementType>
1388 bool DoConvolveBackwardFilter(
1389 Stream* stream, const BatchDescriptor& input_descriptor,
1390 const DeviceMemory<ElementType>& input_data,
1391 const BatchDescriptor& output_descriptor,
1392 const DeviceMemory<ElementType>& backward_output_data,
1393 const ConvolutionDescriptor& convolution_descriptor,
1394 const FilterDescriptor& filter_descriptor,
1395 DeviceMemory<ElementType>* backward_filter_data,
1396 const dnn::AlgorithmDesc& algorithm_desc,
1397 DeviceMemory<uint8>* scratch_memory,
1398 ProfileResult* output_profile_result) {
1399 return IsStatusOk(
1400 DoConvolve(ConvolutionKind::BACKWARD_FILTER,
1401 ToDataType<ElementType>::value, stream, input_descriptor,
1402 input_data, filter_descriptor, *backward_filter_data,
1403 output_descriptor, backward_output_data,
1404 convolution_descriptor, algorithm_desc, *scratch_memory,
1405 output_profile_result),
1406 !output_profile_result);
1407 }
1408
1409 // Return a list of algorithms supported by the backward convolution pass for
1410 // filters.
1411 virtual bool GetConvolveBackwardFilterAlgorithms(
1412 bool with_winograd_nonfused, int cc_major, int cc_minor,
1413 std::vector<AlgorithmDesc>* out_algorithms);
1414
  // Enqueues a backward convolution (for bias) operation onto the stream.
1417 //
1418 // Arguments:
1419 // stream: borrowed pointer to the stream that the 'convolve' operation
1420 // should be enqueued onto.
1421 // input_descriptor: dimensions of the input layer.
1422 // input_data: un-owned device memory region which contains the
1423 // convolution input.
1424 // bias_descriptor: dimensions of the bias tensor. Should be the same as the
1425 // input dimensions, but with the spatial dimensions set to 1.
  //  backward_bias_data: un-owned device memory region in which to place the
  //    backprop of the bias.
1428 virtual bool DoConvolveBackwardBias(Stream* stream,
1429 const BatchDescriptor& input_descriptor,
1430 const DeviceMemory<float>& input_data,
1431 const BatchDescriptor& bias_descriptor,
1432 DeviceMemory<float>* backward_bias_data) {
1433 return false;
1434 }
1435
1436 virtual bool DoConvolveBackwardBias(
1437 Stream* stream, const BatchDescriptor& input_descriptor,
1438 const DeviceMemory<double>& input_data,
1439 const BatchDescriptor& bias_descriptor,
1440 DeviceMemory<double>* backward_bias_data) {
1441 return false;
1442 }
1443
1444 virtual bool DoConvolveBackwardBias(
1445 Stream* stream, const BatchDescriptor& input_descriptor,
1446 const DeviceMemory<Eigen::half>& input_data,
1447 const BatchDescriptor& bias_descriptor,
1448 DeviceMemory<Eigen::half>* backward_bias_data) {
1449 return false;
1450 }
1451
1452 // Fully connects the "nodes" (float values) in input_data with
1453 // shape input_dimensions to output_data with output_dimensions
1454 // using provided weights. This is equivalent to computing a matrix
1455 // product, hence the name MatMul.
1456 //
1457 // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products
1458 // happen in two dimensions. To get down to two dimensions, we consider the
1459 // input y, x and depth dimension as one combined dimension T. For now,
1460 // assume that the output height and width are 1 and let OD be the output
1461 // depth.
1462 //
1463 // There are three device memory buffers passed in to this
1464 // function. We can now view all three as matrices:
1465 //
1466 // input_data: A batch x T matrix
1467 // weights: A T x OD matrix
1468 // output_data: A batch x OD matrix
1469 //
1470 // This function then computes the matrix product of input_data and
1471 // weights and writes the result into output_data.
1472 //
1473 // Here the weights buffer is in row major order, i.e. the first OD
1474 // entries in weights are the first row, the second OD entries in
1475 // weights are the second row and so on.
1476 //
1477 // The case for output width*height > 1 is more complicated. Let K =
1478 // OY * OX where OY is the output height and OX is the output
1479 // width. Then weights is divided into K sub-arrays W_i, for
  // i=0,...,K-1, that each represent a T x OD matrix. This function
1481 // then computes the K matrix multiplications of input_data with
1482 // each W_i. This creates K matrices with dimensions batch x
1483 // OD. These K matrices are concatenated horizontally to form one
1484 // larger matrix with dimensions batch x (K*OD); note that this is
1485 // not the same as concatenating the bytes of the matrices. The
1486 // combined matrix can then be interpreted as a tensor with
1487 // dimensions (batch, OY, OX, OD). If the output tensor format is
1488 // not kBatchYXDepth, this function would then need to arrange for
1489 // the output to be in the requested layout, if that is
1490 // supported. Note that the case K=1 is equivalent to the
1491 // description above. It is recommended to prefer the case K=1.
1492 //
1493 // Arguments (all borrowed):
1494 // stream: borrowed pointer to the stream that the 'fully connect' operation
1495 // should be enqueued onto.
1496 // output_data: un-owned device memory region in which to place the
1497 // fully connected result.
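  //
  // Illustrative example (a sketch of the shapes only, using made-up sizes):
  // with batch = 32 and an input BatchDescriptor of height 1, width 1 and
  // depth 1024, we have T = 1024. If the output BatchDescriptor has height 1,
  // width 1 and depth 10 (so OD = 10 and K = 1), then:
  //   input_data  is viewed as a 32 x 1024 matrix,
  //   weights     is viewed as a 1024 x 10 matrix in row-major order, and
  //   output_data is written as a 32 x 10 matrix.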
1498 virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
1499 const DeviceMemory<float>& weights,
1500 const dnn::BatchDescriptor& input_dimensions,
1501 const dnn::BatchDescriptor& output_dimensions,
1502 DeviceMemory<float>* output_data) = 0;
1503
1504 // Version of DoMatMul that uses pre-quantized 8 bit weights.
1505 // weight_scales specifies the scaling of each column of weights:
1506 // original float weight[row * num_columns + column] =
  //     quantized_weight[row * num_columns + column] * weight_scales[column].
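  //
  // For example (illustrative numbers only): if
  // quantized_weights[row * num_columns + column] is 64 and
  // weight_scales[column] is 0.5f, the effective float weight used in the
  // multiply is 64 * 0.5f = 32.0f.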
1508 virtual bool DoMatMulQuantized(Stream* stream,
1509 const DeviceMemory<float>& input_data,
1510 const DeviceMemory<int8>& quantized_weights,
1511 const DeviceMemory<float>& weight_scales,
1512 const dnn::BatchDescriptor& input_dimensions,
1513 const dnn::BatchDescriptor& output_dimensions,
1514 DeviceMemory<float>* output_data) = 0;
1515
1516 // Version of DoMatMul that uses pre-quantized 16 bit weights.
1517 // weight_scales specifies the scaling of each column of weights:
1518 // original float weight[row * num_columns + column] =
  //     quantized_weight[row * num_columns + column] * weight_scales[column].
1520 virtual bool DoMatMulQuantized(Stream* stream,
1521 const DeviceMemory<float>& input_data,
1522 const DeviceMemory<int16>& quantized_weights,
1523 const DeviceMemory<float>& weight_scales,
1524 const dnn::BatchDescriptor& input_dimensions,
1525 const dnn::BatchDescriptor& output_dimensions,
1526 DeviceMemory<float>* output_data) = 0;
1527
1528 // Adds biases to the feature maps in input_data producing
1529 // output_data. input_data can equal output_data, but must not
1530 // partially overlap it.
1531 //
  // Let K = count() * height() * width() and N = feature_map_count()
  // of dimensions. Then input_data contains K*N values and biases
  // contains N values. We can thus logically consider input_data to
  // contain K vectors of N elements each. This function adds biases
  // to each of those K vectors.
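  //
  // For example (illustrative only): with count() = 2, height() = 1,
  // width() = 1 and feature_map_count() = 3, K = 2 and N = 3, so input_data
  // holds 6 values, biases holds 3 values, and the 3 biases are added to each
  // of the 2 logical vectors of 3 elements.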
1537 //
1538 // TODO(broune): This works differently when width() * height() > 1
1539 // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In
1540 // that case there should be width() * height() *
1541 // feature_map_count() biases, but this is not implemented on all
1542 // StreamExecutors.
1543 //
1544 // Arguments (all borrowed):
1545 // stream: borrowed pointer to the stream that the 'bias add' operation
1546 // should be enqueued onto.
1547 // input_data: un-owned device memory region containing the input.
1548 // biases: un-owned device memory region containing biases to add to the
1549 // input.
1550 // dimensions: dimensions of input_data and output_data.
1551 // output_data: un-owned device memory region in which to place the result.
1552 virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data,
1553 const DeviceMemory<float>& biases,
1554 const dnn::BatchDescriptor& dimensions,
1555 DeviceMemory<float>* output_data) = 0;
1556
1557 // Performs a forward pooling operation on input_data, writing to
1558 // output_data. See PoolingDescriptor for how to configure the
1559 // pooling operation.
1560 //
1561 // Pooling happens as a window that moves across the Y and X
1562 // dimensions of input_data, where each position of the window
1563 // yields one output value. E.g. for max pooling, the computed value
1564 // is the maximum element in the window. The operation is applied
1565 // independently to each batch and at each feature map (depth), so
  // that the output batch count and feature_map_count are the same as for
1567 // the input. The output width and height can be different.
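  //
  // Illustrative example (a sketch assuming max pooling with a 2x2 window,
  // stride 2 and no padding; the actual behavior is controlled by
  // PoolingDescriptor):
  //
  //   input (4x4):        output (2x2):
  //     1  3  2  0
  //     4  6  5  1          6  5
  //     7  2  9  8          7  9
  //     3  1  4  6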
1570 virtual bool DoPoolForward(Stream* stream,
1571 const dnn::PoolingDescriptor& pooling_dimensions,
1572 const dnn::BatchDescriptor& input_dimensions,
1573 const DeviceMemory<float>& input_data,
1574 const dnn::BatchDescriptor& output_dimensions,
1575 DeviceMemory<float>* output_data,
1576 ScratchAllocator* workspace_allocator) = 0;
1577
1578 virtual bool DoPoolForward(Stream* stream,
1579 const dnn::PoolingDescriptor& pooling_dimensions,
1580 const dnn::BatchDescriptor& input_dimensions,
1581 const DeviceMemory<double>& input_data,
1582 const dnn::BatchDescriptor& output_dimensions,
1583 DeviceMemory<double>* output_data,
1584 ScratchAllocator* workspace_allocator) {
1585 LOG(FATAL) << "DoPoolForward not implemented for double.";
1586 return false;
1587 }
1588
1589 virtual bool DoPoolForward(Stream* stream,
1590 const dnn::PoolingDescriptor& pooling_dimensions,
1591 const dnn::BatchDescriptor& input_dimensions,
1592 const DeviceMemory<Eigen::half>& input_data,
1593 const dnn::BatchDescriptor& output_dimensions,
1594 DeviceMemory<Eigen::half>* output_data,
1595 ScratchAllocator* workspace_allocator) {
1596 LOG(FATAL) << "DoPoolForward not implemented for float16.";
1597 return false;
1598 }
1599
1600 virtual bool DoPoolForward(Stream* stream,
1601 const dnn::PoolingDescriptor& pooling_dimensions,
1602 const dnn::BatchDescriptor& input_dimensions,
1603 const DeviceMemory<int8>& input_data,
1604 const dnn::BatchDescriptor& output_dimensions,
1605 DeviceMemory<int8>* output_data,
1606 ScratchAllocator* workspace_allocator) {
1607 LOG(FATAL) << "DoPoolForward not implemented for int8.";
1608 return false;
1609 }
1610
1611 // Performs differentiation of the pooling operation.
1612 virtual bool DoPoolBackward(Stream* stream,
1613 const dnn::PoolingDescriptor& pooling_dimensions,
1614 const dnn::BatchDescriptor& input_dimensions,
1615 const DeviceMemory<double>& input_data,
1616 const dnn::BatchDescriptor& output_dimensions,
1617 const DeviceMemory<double>& output_data,
1618 const DeviceMemory<double>& input_diff_data,
1619 DeviceMemory<double>* output_diff_data,
1620 ScratchAllocator* workspace_allocator) {
1621 LOG(FATAL) << "DoPoolBackward not implemented.";
1622 return false;
1623 }
1624
1625 virtual bool DoPoolBackward(Stream* stream,
1626 const dnn::PoolingDescriptor& pooling_dimensions,
1627 const dnn::BatchDescriptor& input_dimensions,
1628 const DeviceMemory<float>& input_data,
1629 const dnn::BatchDescriptor& output_dimensions,
1630 const DeviceMemory<float>& output_data,
1631 const DeviceMemory<float>& input_diff_data,
1632 DeviceMemory<float>* output_diff_data,
1633 ScratchAllocator* workspace_allocator) {
1634 LOG(FATAL) << "DoPoolBackward not implemented.";
1635 return false;
1636 }
1637
1638 virtual bool DoPoolBackward(Stream* stream,
1639 const dnn::PoolingDescriptor& pooling_dimensions,
1640 const dnn::BatchDescriptor& input_dimensions,
1641 const DeviceMemory<Eigen::half>& input_data,
1642 const dnn::BatchDescriptor& output_dimensions,
1643 const DeviceMemory<Eigen::half>& output_data,
1644 const DeviceMemory<Eigen::half>& input_diff_data,
1645 DeviceMemory<Eigen::half>* output_diff_data,
1646 ScratchAllocator* workspace_allocator) {
1647 LOG(FATAL) << "DoPoolBackward not implemented.";
1648 return false;
1649 }
1650
1651 // Applies local response normalization to the values from input_data and
1652 // writes the result to output_data.
1653 //
1654 // See comments on NormalizeDescriptor for a description of local response
1655 // normalization.
1656 virtual bool DoNormalizeWithDimensions(
1657 Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
1658 const dnn::BatchDescriptor& dimensions,
1659 const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) {
1660 return false;
1661 }
1662
  // Performs backpropagation for the normalization operation.
1664 //
1665 // Given raw data, its corresponding normalized output, and a gradient of some
1666 // unspecified function with respect to the normalized variables, computes the
1667 // gradient of that unspecified function with respect to the raw variables.
1668 //
1669 // The normalized data input array is expected to match the output that would
1670 // be obtained by running the raw data input array through the DoNormalize
1671 // method above.
1672 //
1673 // See comments on NormalizeDescriptor for a description of local response
1674 // normalization.
1675 virtual bool DoNormalizeBackwardWithDimensions(
1676 Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor,
1677 const dnn::BatchDescriptor& dimensions,
1678 const DeviceMemory<float>& raw_data,
1679 const DeviceMemory<float>& normalized_data,
1680 const DeviceMemory<float>& normalized_variable_gradient,
1681 DeviceMemory<float>* raw_variable_gradient,
1682 ScratchAllocator* workspace_allocator) {
1683 return false;
1684 }
1685
1686 // Applies an activation function (see ActivationMode) to all of the values
1687 // held on the device in 'input_data', whose dimensions are described by
1688 // 'dimensions'.
1689 //
1690 // Arguments (all borrowed):
1691 // stream: borrowed pointer to the stream that the 'activate' operation
1692 // should be enqueued onto.
1693 // activation_mode: Type of activation to perform.
1694 // input_data: un-owned device memory region which contains the
1695 // activate input.
1696 // output_data: un-owned device memory region in which to place the
1697 // activate result.
1698 virtual bool DoActivate(Stream* stream, ActivationMode activation_mode,
1699 const BatchDescriptor& dimensions,
1700 const DeviceMemory<float>& input_data,
1701 DeviceMemory<float>* output_data, uint64 options) {
1702 return false;
1703 }
1704
1705 // Concatenates several layers into one, by concatenating the depth of each
1706 // layer at matching x and y coordinates.
1707 // The inputs must all have the same width and height, the output will have
1708 // the same width and height as the inputs and its depth will be the sum of
1709 // the input depths.
1710 //
1711 // Arguments (all borrowed):
1712 // stream: borrowed pointer to the stream that the 'depth concatenate'
1713 // operation should be enqueued onto.
1714 // input_dimensions: The dimensions of each input.
1715 // input_data: un-owned device memory region which contains the
1716 // input data for each input layer.
1717 // output_data: un-owned device memory region in which to place the
1718 // depth concatenate result.
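  //
  // For example (illustrative only): concatenating two inputs of dimensions
  // (batch, y, x, depth) = (4, 8, 8, 3) and (4, 8, 8, 5) produces an output
  // of dimensions (4, 8, 8, 8).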
1719 virtual bool DoDepthConcatenate(
1720 Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1721 port::ArraySlice<const DeviceMemory<float>*> input_data,
1722 DeviceMemory<float>* output_data) = 0;
1723
1724 // Concatenates several layers into one, by concatenating each in the
1725 // x-dimension or y-dimension, based on a user-specified flag.
1726 // For x-concatenation, layers are aligned at matching y and depth
1727 // coordinates, and for y-concatenation, they are aligned at matching x and
1728 // depth coordinates. The inputs must all have the same depth and batch size.
1729 // For x-concatenation, the inputs must have the same height (y-size), and the
1730 // output will have the same depth and height as the inputs and its width (x-
1731 // size) will be the sum of the input widths. For y-concatenation, the inputs
1732 // must have the same width, and the output will have the same depth and width
1733 // as the inputs, and its height will be the sum of the input heights.
1734 //
1735 // Arguments:
1736 // stream: borrowed pointer to the stream that the 'space concatenate'
1737 // operation should be enqueued onto.
1738 // input_dimensions: the dimensions of each input.
1739 // input_data: un-owned device memory region which contains the input data
1740 // for each input layer.
1741 // output_data: un-owned device memory region in which to place the space
1742 // concatenate result.
  //  concat_direction: either dnn::SpaceConcatenateMode::XDirection or
1744 // dnn::SpaceConcatenateMode::YDirection.
1745 virtual bool DoSpaceConcatenate(
1746 Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1747 port::ArraySlice<const DeviceMemory<float>*> input_data,
1748 DeviceMemory<float>* output_data,
1749 dnn::SpaceConcatenateMode concat_direction) {
1750 return false;
1751 }
1752
1753 // Change the layout of the data by shrinking one dimension (or set of
1754 // dimensions) and growing another dimension (or set of dimensions), while
1755 // keeping the total number of data elements constant, and maintaining the
1756 // current data ordering.
1757 //
1758 // Currently, the only supported operation is depth into space by a power of
1759 // 2. E.g. (y, x, z) -> (y*2, x*2, z/4)
1760 //
1761 // Note that Reshape may not be a no-op, depending on the platform and which
1762 // dimensions are being changed.
1763 //
1764 // Example: forgetting about batch for the moment, let's take a tensor that's
1765 // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout
1766 // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The
1767 // elements of the tensor range from 0 to 15. The x,y,z indices are below each
1768 // element.
1769 //
1770 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1771 // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1
1772 // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0
1773 // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7
1774 //
1775 // reshape to 4x2x2
1776 //
1777 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1778 // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3
1779 // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1
1780 // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1
1781 virtual bool DoReshape(Stream* stream,
1782 const dnn::BatchDescriptor& input_dimensions,
1783 const DeviceMemory<float>& input_data,
1784 const dnn::BatchDescriptor& output_dimensions,
1785 DeviceMemory<float>* output_data) {
1786 return false;
1787 }
1788
1789 // Depth to space takes an X by Y image with depth D*M² and changes it to an
1790 // MX x MY image with depth D. Each input location (x,y) with depth D*M² in
1791 // the input image is changed to an MxM contiguous area in the output image,
  // with the values laid out in the raster order specified by
  // DepthToSpaceLayout, and with a new depth of D.
1794 //
1795 // Example.
  //  M=2, Din=8, Xin=2, Yin=2. Xout=4, Yout=4, Dout=2
1797 // DepthHeightWidth layout
1798 // Values within a 'cell' are at different depths and same x & y.
1799 // Input:
1800 // abcdefgh ijklmnop
1801 // qrstuvwx yz012345
1802 // Output:
1803 // ae bf im jn
1804 // cg dh ko lp
1805 // qu rv y2 z3
1806 // sw tx 04 15
1807 //
1808 // sqrt_depth_reduction: 'M' in the comment above
1809 virtual bool DoDepthToSpace(Stream* stream,
1810 const dnn::BatchDescriptor& input_dimensions,
1811 const DeviceMemory<float>& input_data,
1812 const DepthToSpaceLayout& depth_to_space_layout,
1813 const int& sqrt_depth_reduction,
1814 DeviceMemory<float>* output_data) {
1815 return false;
1816 }
1817
1818 // Space to depth is the inverse of depth to space. Space to depth takes each
1819 // non-overlapping M by M patch (in the X and Y dimensions) with depth D of
1820 // the input, and transforms it to a 1 by 1 patch with depth D*M². If the
1821 // input has size (MX, MY, D), the output has size (X, Y, D*M²). The number of
1822 // data elements is not changed.
1823 //
1824 // Example.
  //  M=2, Din=2, Xin=4, Yin=4, Dout=8
1826 // DepthHeightWidth layout
1827 // Values within a 'cell' are at different depths and same x & y.
1828 // Input:
1829 // ae bf im jn
1830 // cg dh ko lp
1831 // qu rv y2 z3
1832 // sw tx 04 15
1833 // Output:
1834 // abcdefgh ijklmnop
1835 // qrstuvwx yz012345
1836 //
1837 // sqrt_depth_increase: 'M' in the comment above
1838 virtual bool DoSpaceToDepth(Stream* stream,
1839 const dnn::BatchDescriptor& input_dimensions,
1840 const DeviceMemory<float>& input_data,
1841 const DepthToSpaceLayout& space_to_depth_layout,
1842 const int& sqrt_depth_increase,
1843 DeviceMemory<float>* output_data) {
1844 return false;
1845 }
1846
1847 // Computes the specified operation (e.g. addition or multiplication)
1848 // between corresponding elements in the inputs and stores the result in the
1849 // output element.
1850 // The inputs and output must all have the same dimensions, but may have
1851 // different quantization parameters (min_value and max_value).
1852 //
1853 // Arguments (all borrowed):
1854 // stream: borrowed pointer to the stream that the 'elementwise operation'
1855 // should be enqueued onto.
1856 // operation: The operation to perform.
1857 // input_dimensions: The dimensions of each input.
1858 // input_data: un-owned device memory region which contains the
1859 // input data for each input layer.
1860 // output_dimensions: The dimensions of the output.
1861 // output_data: un-owned device memory region in which to place the
1862 // operation result.
1863 virtual bool DoElementwiseOperate(
1864 Stream* stream, ElementwiseOperation operation,
1865 port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1866 port::ArraySlice<const DeviceMemory<float>*> input_data,
1867 const dnn::BatchDescriptor& output_dimensions,
1868 DeviceMemory<float>* output_data) = 0;
1869
1870 // Computes the specified operation (e.g. addition or multiplication)
1871 // between corresponding elements in the inputs and stores the result in the
1872 // output element. Each input is multiplied by a scalar constant and the
1873 // result is divided by a scalar constant.
1874 // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11
1875 // and the output divisor to 10.
1876 // The inputs and output must all have the same dimensions, but may have
1877 // different quantization parameters (min_value and max_value).
1878 //
1879 // Arguments (all borrowed):
1880 // stream: borrowed pointer to the stream that the 'elementwise operation'
1881 // should be enqueued onto.
1882 // operation: The operation to perform.
1883 // input_multiplicands: Amount to scale each input.
1884 // output_divisor: Amount to divide the output.
1885 // input_dimensions: The dimensions of each input.
1886 // input_data: un-owned device memory region which contains the
1887 // input data for each input layer.
1888 // output_dimensions: The dimensions of the output.
1889 // output_data: un-owned device memory region in which to place the
1890 // operation result.
1891 virtual bool DoElementwiseOperateScaledQuantized(
1892 Stream* stream, ElementwiseOperation operation,
1893 port::ArraySlice<int> input_multiplicands, int output_divisor,
1894 port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1895 port::ArraySlice<const DeviceMemory<float>*> input_data,
1896 const dnn::BatchDescriptor& output_dimensions,
1897 DeviceMemory<float>* output_data) {
1898 return false;
1899 }
1900
1901 // Pads the input with zeros in the X and Y dimensions. The feature_map
1902 // dimension is unchanged.
1903 //
1904 // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'padding' operation
  //    should be enqueued onto.
1907 // dimensions: The dimensions of the input.
1908 // input_data: un-owned device memory region which contains the
1909 // input data for the input layer.
1910 // left_pad: Amount to pad the input on the left.
1911 // right_pad: Amount to pad the input on the right.
1912 // top_pad: Amount to pad the input at the top (low Y).
1913 // bottom_pad: Amount to pad the input at the bottom (high Y).
1914 // output_data: un-owned device memory region in which to place the
1915 // padded result.
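  //
  // For example (illustrative only): an input of height H and width W is
  // padded to height top_pad + H + bottom_pad and width
  // left_pad + W + right_pad; the added elements are zeros.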
1916 virtual bool DoXYPad(Stream* stream, const dnn::BatchDescriptor &dimensions,
1917 const DeviceMemory<float> &input_data,
1918 int64 left_pad, int64 right_pad, int64 top_pad,
1919 int64 bottom_pad, DeviceMemory<float> *output_data) = 0;
1920
1921 // Extracts a slice of the input in the X and Y dimensions. The feature_map
1922 // dimension is unchanged.
1923 //
1924 // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'slice' operation
  //    should be enqueued onto.
1927 // dimensions: The dimensions of the input.
1928 // input_data: un-owned device memory region which contains the
1929 // input data for the input layer.
1930 // left_trim: Amount to cut off the input on the left.
1931 // right_trim: Amount to cut off the input on the right.
1932 // top_trim: Amount to cut off the input at the top (low y).
1933 // bottom_trim: Amount to cut off the input at the bottom (high Y).
1934 // output_data: un-owned device memory region in which to place the
  //    sliced result.
1936 virtual bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor &dimensions,
1937 const DeviceMemory<float> &input_data,
1938 int64 left_trim, int64 right_trim, int64 top_trim,
1939 int64 bottom_trim, DeviceMemory<float> *output_data) = 0;
1940
1941 // Grows the input tensor by replicating the X and Y dimensions. The batch and
1942 // depth/feature_map dimensions are unchanged. Currently, the input tensor is
1943 // limited to X=1 and Y=1.
1944 //
1945 // For example, the input has dimensions x=2, y=3, and replicate_x=3,
1946 // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1,
1947 // x0y2, x1y0, x0y1, x1y2].
1948 // Here is the example as a picture. input:
1949 // AB
1950 // CD
1951 // EF
1952 // broadcast result:
1953 // ABABAB
1954 // CDCDCD
1955 // EFEFEF
1956 // ABABAB
1957 // CDCDCD
1958 // EFEFEF
1959 //
1960 // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'broadcast' operation
  //    should be enqueued onto.
1963 // dimensions: The dimensions of the input.
1964 // input_data: un-owned device memory region which contains the
1965 // input data for the input layer.
1966 // replicate_x: Amount to replicate the input's X dimension.
1967 // replicate_y: Amount to replicate the input's Y dimension.
1968 // output_data: un-owned device memory region in which to place the
  //    broadcast result.
1970 virtual bool DoXYBroadcast(Stream* stream,
1971 const dnn::BatchDescriptor& dimensions,
1972 const DeviceMemory<float>& input_data,
1973 int64 replicate_x, int64 replicate_y,
1974 DeviceMemory<float>* output_data) {
1975 return false;
1976 }
1977
1978 // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that
1979 // is, bytes instead of scaled floats) into 'host_dst' if they are available
1980 // for the underlying DNN implementation. If this quantized output is not
1981 // available, false is returned, which will place 'stream' into an error
1982 // state.
1983 //
1984 // Arguments (all borrowed):
1985 // stream: borrowed pointer to the stream that the 'quantized memcpy'
1986 // operation should be enqueued onto.
1987 // gpu_unquantized_src: the device memory that contains the unquantized data
1988 // -- this data should also have a corresponding quantized representation
1989 // on the device for this operation to succeed.
1990 // mode: Type of quantization of the data to write into host_dst.
1991 // host_dst: un-owned host memory region that is mutated in place,
1992 // it is clobbered by the values in 'gpu_unquantized_src' when the enqueued
1993 // (asynchronous) memcpy operation is performed.
1994 // size: size in bytes of the host_dst host memory region.
1995 virtual bool DoMemcpyD2HQuantized(
1996 Stream* stream, const DeviceMemory<float>& gpu_unquantized_src,
1997 QuantizedActivationMode mode, void* host_dst, int64 size) = 0;
1998
1999 // Enqueues an asynchronous memcpy of 'host_dst' into the *quantized* input
2000 // of a layer (that is, bytes instead of scaled floats) if they are supported
2001 // by the underlying DNN implementation. If this quantized input is not
2002 // supported, false is returned, which will place 'stream' into an error
2003 // state.
2004 //
2005 // Arguments (all borrowed):
2006 // stream: borrowed pointer to the stream that the 'quantized memcpy'
2007 // operation should be enqueued onto.
2008 // host_src: un-owned host memory region that contains the quantized data.
2009 // size: size in bytes of the host_src host memory region.
2010 // mode: Type of quantization of the data to read from host_src.
2011 // gpu_unquantized_dst: the device memory that is clobbered by the values in
2012 // 'host_src' when the enqueued (asynchronous) memcpy operation is
2013 // performed. -- this data should also have a corresponding quantized
2014 // representation on the device for this operation to
2015 // succeed.
2016 virtual bool DoMemcpyH2DQuantized(
2017 Stream* stream, const void* host_src, int64 size,
2018 QuantizedActivationMode mode,
2019 DeviceMemory<float>* gpu_unquantized_dst) = 0;
2020
2021 // Create an RNN descriptor based on model shapes and configurations.
2022 // The caller retains the ownership of the descriptor.
2023 //
2024 // Arguments:
  //  num_layers: the number of layers for an RNN model.
2026 // hidden_size: the size of the hidden state.
2027 // input_size: the size of the input state.
2028 // input_mode: an enum to specify whether a linear transformation is added
2029 // after the input state. If input_size is different from hidden_size, this
2030 // is required.
2031 // direction_mode: an enum to specify whether this model is unidirectional or
2032 // bidirectional.
2033 // rnn_mode: an enum to specify the type of model to build.
2034 // data_type: an enum to specify the data types used in this model.
2035 // dropout: the dropout threshold between layers. When it is 0., no dropout
2036 // is added.
2037 // seed: a seed for initializing the dropout layers.
  //  state_allocator: a memory allocator that will be used to store the state
  //    for the dropout layer. The user has to maintain the memory until the
  //    model is no longer in use.
2041 virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
2042 createRnnDescriptor(int num_layers, int hidden_size, int input_size,
2043 int batch_size, dnn::RnnInputMode input_mode,
2044 dnn::RnnDirectionMode direction_mode,
2045 dnn::RnnMode rnn_mode, dnn::DataType data_type,
2046 const dnn::AlgorithmConfig& algorithm_config,
2047 float dropout, uint64 seed,
2048 ScratchAllocator* state_allocator) {
2049 return port::Status(port::error::UNIMPLEMENTED,
2050 "createRnnDescriptor is unimplemented");
2051 }
2052
  // Create an RNN sequence descriptor that specifies either the input or output
2054 // sequence. The caller retains the ownership of the returned descriptor.
2055 //
2056 // Arguments:
2057 // max_seq_length: the max length of the sequences.
2058 // batch_size: the size of a minibatch.
2059 // data_size: the size of the state.
  //  seq_lengths: the lengths of the sequences in a batch.
2061 // data_type: an enum to specify the type for the underlying data.
2062 virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
2063 createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
2064 int data_size, dnn::DataType data_type) {
2065 return port::Status(port::error::UNIMPLEMENTED,
2066 "createRnnSequenceTensorDescriptor is unimplemented");
2067 }
2068
2069 virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
2070 createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
2071 int data_size,
2072 const absl::Span<const int>& seq_lengths,
2073 bool time_major, dnn::DataType data_type) {
2074 return port::Status(port::error::UNIMPLEMENTED,
2075 "createRnnSequenceTensorDescriptor is unimplemented");
2076 }
2077
2078 // Create an RNN state descriptor that specifies the input or hidden state.
2079 // The caller retains the ownership of the returned descriptor.
2080 virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
2081 createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
2082 dnn::DataType data_type) {
2083 return port::Status(port::error::UNIMPLEMENTED,
2084 "createRnnStateTensorDescriptor is unimplemented");
2085 }
2086
2087 // Enqueue a forward operation of the RNN model onto the stream.
2088 //
2089 // Arguments:
2090 // stream: pointer to the stream where this operation should be enqueued to.
2091 // rnn_desc: a RNN descriptor created by createRnnDescriptor.
2092 // input_desc: descriptor for the input sequence.
2093 // input_data: the device memory region that contains the input data.
2094 // input_h_desc: descriptor for the input "h" state.
2095 // input_h_data: the device memory region that contains the input "h" data.
2096 // input_c_desc: descriptor for the input "c" state.
2097 // input_c_data: the device memory region that contains the input "c" data.
2098 // This must be specified for LSTM models.
2099 // params: the device memory region that contains the parameters used in this
2100 // model.
2101 // output_desc: descriptor for the output sequence.
2102 // output_data: the memory region that stores the output sequence data.
2103 // output_h_desc: descriptor for the output "h" state.
2104 // output_h_data: the memory region that stores the output "h" data.
2105 // output_c_desc: descriptor for the output "c" state.
2106 // output_c_data: the memory region that stores the output "c" data. This
2107 // must be specified for LSTM models.
  //  is_training: whether this is used in training or inference. That decides
  //    whether reserve_space data needs to be produced.
  //  reserve_space_allocator: if "is_training" is true, a memory allocator
  //    to create memory that holds the produced reserve_space. The caller
  //    retains the data and feeds it to the backward pass.
  //  workspace_allocator: an allocator to create temporary workspace used in
  //    this kernel. The caller is responsible for retaining the memory long
  //    enough for the lifespan of this operation, and recycling it afterwards.
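  //
  // Typical call sequence (an illustrative sketch only; descriptor arguments,
  // StatusOr handling and device memory allocation are elided, and 'dnn'
  // stands for a DnnSupport implementation obtained elsewhere):
  //
  //   auto rnn_desc = dnn->createRnnDescriptor(...);
  //   auto seq_desc = dnn->createRnnSequenceTensorDescriptor(...);
  //   auto state_desc = dnn->createRnnStateTensorDescriptor(...);
  //   dnn->DoRnnForward(stream, /*rnn_desc=*/..., /*input_desc=*/...,
  //                     input_data, ..., /*is_training=*/true,
  //                     reserve_space_allocator, workspace_allocator,
  //                     /*output_profile_result=*/nullptr);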
2116 virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2117 const dnn::RnnSequenceTensorDescriptor& input_desc,
2118 const DeviceMemory<Eigen::half>& input_data,
2119 const dnn::RnnStateTensorDescriptor& input_h_desc,
2120 const DeviceMemory<Eigen::half>& input_h_data,
2121 const dnn::RnnStateTensorDescriptor& input_c_desc,
2122 const DeviceMemory<Eigen::half>& input_c_data,
2123 const DeviceMemory<Eigen::half>& params,
2124 const dnn::RnnSequenceTensorDescriptor& output_desc,
2125 DeviceMemory<Eigen::half>* output_data,
2126 const dnn::RnnStateTensorDescriptor& output_h_desc,
2127 DeviceMemory<Eigen::half>* output_h_data,
2128 const dnn::RnnStateTensorDescriptor& output_c_desc,
2129 DeviceMemory<Eigen::half>* output_c_data,
2130 bool is_training,
2131 ScratchAllocator* reserve_space_allocator,
2132 ScratchAllocator* workspace_allocator,
2133 dnn::ProfileResult* output_profile_result) {
2134 return false;
2135 }
2136
2137 virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2138 const dnn::RnnSequenceTensorDescriptor& input_desc,
2139 const DeviceMemory<float>& input_data,
2140 const dnn::RnnStateTensorDescriptor& input_h_desc,
2141 const DeviceMemory<float>& input_h_data,
2142 const dnn::RnnStateTensorDescriptor& input_c_desc,
2143 const DeviceMemory<float>& input_c_data,
2144 const DeviceMemory<float>& params,
2145 const dnn::RnnSequenceTensorDescriptor& output_desc,
2146 DeviceMemory<float>* output_data,
2147 const dnn::RnnStateTensorDescriptor& output_h_desc,
2148 DeviceMemory<float>* output_h_data,
2149 const dnn::RnnStateTensorDescriptor& output_c_desc,
2150 DeviceMemory<float>* output_c_data,
2151 bool is_training,
2152 ScratchAllocator* reserve_space_allocator,
2153 ScratchAllocator* workspace_allocator,
2154 dnn::ProfileResult* output_profile_result) {
2155 return false;
2156 }
2157
2158 virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2159 const dnn::RnnSequenceTensorDescriptor& input_desc,
2160 const DeviceMemory<double>& input_data,
2161 const dnn::RnnStateTensorDescriptor& input_h_desc,
2162 const DeviceMemory<double>& input_h_data,
2163 const dnn::RnnStateTensorDescriptor& input_c_desc,
2164 const DeviceMemory<double>& input_c_data,
2165 const DeviceMemory<double>& params,
2166 const dnn::RnnSequenceTensorDescriptor& output_desc,
2167 DeviceMemory<double>* output_data,
2168 const dnn::RnnStateTensorDescriptor& output_h_desc,
2169 DeviceMemory<double>* output_h_data,
2170 const dnn::RnnStateTensorDescriptor& output_c_desc,
2171 DeviceMemory<double>* output_c_data,
2172 bool is_training,
2173 ScratchAllocator* reserve_space_allocator,
2174 ScratchAllocator* workspace_allocator,
2175 dnn::ProfileResult* output_profile_result) {
2176 return false;
  }

2178 // Enqueue a backward operation of the RNN model onto the stream.
2179 //
2180 // Arguments:
2181 // stream: pointer to the stream where this operation should be enqueued to.
2182 // rnn_desc: a RNN descriptor created by createRnnDescriptor.
2183 // input_desc: descriptor for the input sequence.
2184 // input_data: the device memory region that contains the input data.
2185 // input_h_desc: descriptor for the input "h" state.
2186 // input_h_data: the device memory region that contains the input "h" data.
2187 // input_c_desc: descriptor for the input "c" state.
2188 // input_c_data: the device memory region that contains the input "c" data.
2189 // This must be specified for LSTM models.
2190 // params: the device memory region that contains the parameters used in this
2191 // model.
2192 // output_desc: descriptor for the output sequence.
2193 // output_data: the memory region that stores the output sequence data.
2194 // output_h_desc: descriptor for the output "h" state.
2195 // output_h_data: the memory region that stores the output "h" data.
2196 // output_c_desc: descriptor for the output "c" state.
2197 // output_c_data: the memory region that stores the output "c" data. This
2198 // must be specified for LSTM models.
2199 // output_backprop_data: the device memory region that contains the backprop
2200 // to the output sequence.
2201 // output_h_backprop_data: the device memory region that contains the
2202 // backprop to the output "h" state.
2203 // output_c_backprop_data: the device memory region that contains the
2204 // backprop to the output "c" state.
2205 // input_backprop_data: the device memory region that stores the backprop
2206 // to the input sequence.
2207 // input_h_backprop_data: the device memory region that stores the backprop
2208 // to the input "h" state.
2209 // input_c_backprop_data: the device memory region that stores the backprop
2210 // to the input "c" state.
2211 // params_backprop_data: the device memory region that stores the backprop
2212 // to the parameters.
2213 // reserve_space_data: the reserve_space data that is produced by the forward
2214 // operation. This memory region could be modified by this operation.
2215 // workspace_allocator: a memory allocator that creates the temporary
2216 // workspace memory used by this operation. The caller is responsible for
  //    keeping the memory alive long enough for this operation, and recycling
  //    it afterwards.
2219 virtual bool DoRnnBackward(
2220 Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2221 const dnn::RnnSequenceTensorDescriptor& input_desc,
2222 const DeviceMemory<Eigen::half>& input_data,
2223 const dnn::RnnStateTensorDescriptor& input_h_desc,
2224 const DeviceMemory<Eigen::half>& input_h_data,
2225 const dnn::RnnStateTensorDescriptor& input_c_desc,
2226 const DeviceMemory<Eigen::half>& input_c_data,
2227 const DeviceMemory<Eigen::half>& params,
2228 const dnn::RnnSequenceTensorDescriptor& output_desc,
2229 const DeviceMemory<Eigen::half>& output_data,
2230 const dnn::RnnStateTensorDescriptor& output_h_desc,
2231 const DeviceMemory<Eigen::half>& output_h_data,
2232 const dnn::RnnStateTensorDescriptor& output_c_desc,
2233 const DeviceMemory<Eigen::half>& output_c_data,
2234 const DeviceMemory<Eigen::half>& output_backprop_data,
2235 const DeviceMemory<Eigen::half>& output_h_backprop_data,
2236 const DeviceMemory<Eigen::half>& output_c_backprop_data,
2237 DeviceMemory<Eigen::half>* input_backprop_data,
2238 DeviceMemory<Eigen::half>* input_h_backprop_data,
2239 DeviceMemory<Eigen::half>* input_c_backprop_data,
2240 DeviceMemory<Eigen::half>* params_backprop_data,
2241 DeviceMemory<uint8>* reserve_space_data,
2242 ScratchAllocator* workspace_allocator,
2243 dnn::ProfileResult* output_profile_result) {
2244 return false;
2245 }
2246
2247 virtual bool DoRnnBackward(
2248 Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2249 const dnn::RnnSequenceTensorDescriptor& input_desc,
2250 const DeviceMemory<float>& input_data,
2251 const dnn::RnnStateTensorDescriptor& input_h_desc,
2252 const DeviceMemory<float>& input_h_data,
2253 const dnn::RnnStateTensorDescriptor& input_c_desc,
2254 const DeviceMemory<float>& input_c_data,
2255 const DeviceMemory<float>& params,
2256 const dnn::RnnSequenceTensorDescriptor& output_desc,
2257 const DeviceMemory<float>& output_data,
2258 const dnn::RnnStateTensorDescriptor& output_h_desc,
2259 const DeviceMemory<float>& output_h_data,
2260 const dnn::RnnStateTensorDescriptor& output_c_desc,
2261 const DeviceMemory<float>& output_c_data,
2262 const DeviceMemory<float>& output_backprop_data,
2263 const DeviceMemory<float>& output_h_backprop_data,
2264 const DeviceMemory<float>& output_c_backprop_data,
2265 DeviceMemory<float>* input_backprop_data,
2266 DeviceMemory<float>* input_h_backprop_data,
2267 DeviceMemory<float>* input_c_backprop_data,
2268 DeviceMemory<float>* params_backprop_data,
2269 DeviceMemory<uint8>* reserve_space_data,
2270 ScratchAllocator* workspace_allocator,
2271 dnn::ProfileResult* output_profile_result) {
2272 return false;
2273 }
2274
2275 virtual bool DoRnnBackward(
2276 Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2277 const dnn::RnnSequenceTensorDescriptor& input_desc,
2278 const DeviceMemory<double>& input_data,
2279 const dnn::RnnStateTensorDescriptor& input_h_desc,
2280 const DeviceMemory<double>& input_h_data,
2281 const dnn::RnnStateTensorDescriptor& input_c_desc,
2282 const DeviceMemory<double>& input_c_data,
2283 const DeviceMemory<double>& params,
2284 const dnn::RnnSequenceTensorDescriptor& output_desc,
2285 const DeviceMemory<double>& output_data,
2286 const dnn::RnnStateTensorDescriptor& output_h_desc,
2287 const DeviceMemory<double>& output_h_data,
2288 const dnn::RnnStateTensorDescriptor& output_c_desc,
2289 const DeviceMemory<double>& output_c_data,
2290 const DeviceMemory<double>& output_backprop_data,
2291 const DeviceMemory<double>& output_h_backprop_data,
2292 const DeviceMemory<double>& output_c_backprop_data,
2293 DeviceMemory<double>* input_backprop_data,
2294 DeviceMemory<double>* input_h_backprop_data,
2295 DeviceMemory<double>* input_c_backprop_data,
2296 DeviceMemory<double>* params_backprop_data,
2297 DeviceMemory<uint8>* reserve_space_data,
2298 ScratchAllocator* workspace_allocator,
2299 dnn::ProfileResult* output_profile_result) {
2300 return false;
2301 }
2302
2303 // Transforms a tensor into another tensor with a different layout and/or data
2304 // type.
2305 //
2306 // Arguments:
2307 // stream: pointer to the stream where this operation should be enqueued to.
2308 // input_desc: specifies the shape and the data layout of the input tensor.
2309 // input_type: the data type of the input tensor.
2310 // input_data: the device memory region that contains the input tensor.
2311 // output_desc: specifies the shape and the data layout of the output tensor.
2312 // output_type: the data type of the output tensor.
2313 // scale: an element-wise scaling factor to apply.
2314 // output_data: the device memory region that contains the output tensor.
2315 virtual bool DoTransformTensor(Stream* stream,
2316 const dnn::BatchDescriptor& input_desc,
2317 dnn::DataType input_type,
2318 const DeviceMemoryBase& input_data,
2319 const dnn::BatchDescriptor& output_desc,
2320 dnn::DataType output_type, float scale,
2321 DeviceMemoryBase* output_data) {
2322 return false;
2323 }
2324
2325 // Enqueues a fused convolution+bias+activation operation onto the stream.
2326 //
2327 // Arguments (all borrowed):
2328 //
2329 // stream: borrowed pointer to the stream that the 'fusion' operation should
2330 // be enqueued onto.
2331 //
2332 // conv_input_descriptor: dimensions of the convolution input layer.
2333 // conv_input_data: device memory which contains the convolution input.
2334 //
2335 // filter_descriptor: dimensions of the convolution filter.
2336 // filter_data: device memory which contains the convolution filter weights.
2337 //
2338 // convolution_descriptor: stride of the convolution filter.
2339 //
  //  bias_descriptor: dimensions of the bias layer.
  //  bias_data: device memory region containing biases to add to the
  //    convolution output.
2343 //
2344 // activation_mode: Type of activation to perform.
2345 //
2346 // output_descriptor: dimensions of the output layer.
2347 // output_data: device memory region in which to place the fusion result.
2348 //
2349 // output_profile_result: the output profile result for this call.
2350 // The profiling is only enabled when this is not nullptr.
2351 //
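  // Conceptually (ignoring performance-related differences from fusion), the
  // result is equivalent to:
  //
  //   output = activation(convolve(conv_input, filter) + bias_data)
  //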
2352 virtual bool DoFusedConvolutionBiasActivation(
2353 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
2354 const DeviceMemory<float>& conv_input_data,
2355 const dnn::FilterDescriptor& filter_descriptor,
2356 const DeviceMemory<float>& filter_data,
2357 const dnn::ConvolutionDescriptor& convolution_descriptor,
2358 const dnn::BatchDescriptor& bias_descriptor,
2359 const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode,
2360 const dnn::BatchDescriptor& output_descriptor,
2361 DeviceMemory<float>* output_data,
2362 dnn::ProfileResult* output_profile_result) {
2363 return false;
2364 }
2365
2366 // Enqueues a fused batchnorm+activation (inference) operation onto the
2367 // stream.
2368 //
2369 // Arguments (all borrowed):
2370 //
2371 // stream: borrowed pointer to the stream that the 'fusion' operation should
2372 // be enqueued onto.
2373 //
2374 // x_descriptor: dimensions of the batchnorm input layer.
2375 // x_data: device memory which contains the batchnorm input.
2376 //
2377 // scale_offset_mean_variance_descriptor:
2378 // dimensions of the scale/offset/mean/variance tensor.
2379 // scale_data: device memory which contains the scale input.
2380 // offset_data: device memory which contains the offset input.
2381 // mean_data: device memory which contains the mean input.
2382 // variance_data: device memory which contains the variance input.
  //  epsilon: the epsilon value to use in batchnorm calculation.
2384 //
2385 // activation_mode: Type of activation to perform.
2386 //
2387 // y_data: device memory region in which to place the fusion result.
2388 //
2389 // output_profile_result: the output profile result for this call.
2390 // The profiling is only enabled when this is not nullptr.
2391 //
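  // Conceptually (standard batch normalization at inference time), each
  // output element is computed approximately as:
  //
  //   y = activation(scale * (x - mean) / sqrt(variance + epsilon) + offset)
  //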
2392 virtual bool DoFusedBatchNormActivationInference(
2393 Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2394 const DeviceMemory<float>& x_data,
2395 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2396 const DeviceMemory<float>& scale_data,
2397 const DeviceMemory<float>& offset_data,
2398 const DeviceMemory<float>& mean_data,
2399 const DeviceMemory<float>& variance_data, double epsilon,
2400 dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
2401 dnn::ProfileResult* output_profile_result) {
2402 return false;
2403 }
2404
2405 virtual bool DoFusedBatchNormActivationInference(
2406 Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2407 const DeviceMemory<Eigen::half>& x_data,
2408 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2409 const DeviceMemory<float>& scale_data,
2410 const DeviceMemory<float>& offset_data,
2411 const DeviceMemory<float>& mean_data,
2412 const DeviceMemory<float>& variance_data, double epsilon,
2413 dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
2414 dnn::ProfileResult* output_profile_result) {
2415 return false;
2416 }
2417
2418 // Enqueues a fused batchnorm+activation (training-fwd) operation onto the
2419 // stream.
2420 //
2421 // Arguments (all borrowed):
2422 //
2423 // stream: borrowed pointer to the stream that the 'fusion' operation should
2424 // be enqueued onto.
2425 //
2426 // x_descriptor: dimensions of the batchnorm input layer.
2427 // x_data: device memory which contains the batchnorm input.
2428 //
2429 // scale_offset_mean_variance_descriptor:
2430 // dimensions of the scale/offset/mean/variance tensor.
2431 // scale_data: device memory which contains the scale input.
2432 // offset_data: device memory which contains the offset input.
  //  epsilon: the epsilon value to use in batchnorm calculation.
2434 //
2435 // activation_mode: Type of activation to perform.
2436 //
2437 // y_data: device memory region in which to place the fusion result.
2438 // batch_mean_data: device memory in which to place the batch mean output.
2439 // batch_var_data: device memory in which to place the batch variance output.
2440 // saved_mean_data: device memory in which to save the mean for bwd pass.
2441 // saved_var_data: device memory in which to save the variance for bwd pass.
2442 //
2443 // output_profile_result: the output profile result for this call.
2444 // The profiling is only enabled when this is not nullptr.
2445 //
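  // Conceptually (standard batch normalization during training), batch_mean
  // and batch_var are computed from x_data over the batch, and each output
  // element is computed approximately as:
  //
  //   y = activation(scale * (x - batch_mean) / sqrt(batch_var + epsilon)
  //                  + offset)
  //
  // with the mean and variance also saved for the backward pass.
  //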
2446 virtual bool DoFusedBatchNormActivationForward(
2447 Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2448 const DeviceMemory<float>& x_data,
2449 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2450 const DeviceMemory<float>& scale_data,
2451 const DeviceMemory<float>& offset_data, double epsilon,
2452 dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
2453 DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
2454 DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
2455 dnn::ProfileResult* output_profile_result) {
2456 return false;
2457 }
2458
2459 virtual bool DoFusedBatchNormActivationForward(
2460 Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2461 const DeviceMemory<Eigen::half>& x_data,
2462 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2463 const DeviceMemory<float>& scale_data,
2464 const DeviceMemory<float>& offset_data, double epsilon,
2465 dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
2466 DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
2467 DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
2468 dnn::ProfileResult* output_profile_result) {
2469 return false;
2470 }
2471
2472 // Enqueues a fused batchnorm+activation (training-bwd) operation onto the
2473 // stream.
2474 //
2475 // Arguments (all borrowed):
2476 //
2477 // stream: borrowed pointer to the stream that the 'fusion' operation should
2478 // be enqueued onto.
2479 //
  //  y_act_backprop_descriptor: dimensions of the backprop input from the
  //    previous layer.
  //  y_act_backprop_data: device memory which contains the backprop input.
2483 //
2484 // y_act_data: device memory which contains the actv-fwd output data.
2485 //
2486 // activation_mode: actv-fwd type.
2487 //
2488 // scale_offset_mean_variance_descriptor:
2489 // dimensions of the scale/offset/mean/variance tensor.
2490 // scale_data: device memory which contains the scale input.
2491 // offset_data: device memory which contains the offset input.
  //  saved_mean_data: device memory which contains the saved mean from the
  //    fwd pass.
  //  saved_var_data: device memory which contains the saved variance from the
  //    fwd pass.
2495 //
  //  x_bn_backprop_data: device memory region in which to place the backprop
  //    data from this layer.
  //  scale_backprop_data: device memory in which to place the scale backprop
  //    output.
  //  offset_backprop_data: device memory in which to place the offset backprop
  //    output.
2500 //
2501 // output_profile_result: the output profile result for this call.
2502 // The profiling is only enabled when this is not nullptr.
2503 //
2504 virtual bool DoFusedBatchNormActivationBackward(
2505 Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
2506 const DeviceMemory<float>& y_act_backprop_data,
2507 const DeviceMemory<float>& y_act_data,
2508 dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data,
2509 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2510 const DeviceMemory<float>& scale_data,
2511 const DeviceMemory<float>& offset_data,
2512 const DeviceMemory<float>& saved_mean_data,
2513 const DeviceMemory<float>& saved_var_data,
2514 DeviceMemory<float>* x_bn_backprop_data,
2515 DeviceMemory<float>* scale_backprop_data,
2516 DeviceMemory<float>* offset_backprop_data,
2517 dnn::ProfileResult* output_profile_result) {
2518 return false;
2519 }
2520
2521 virtual bool DoFusedBatchNormActivationBackward(
2522 Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor,
2523 const DeviceMemory<Eigen::half>& y_act_backprop_data,
2524 const DeviceMemory<Eigen::half>& y_act_data,
2525 dnn::ActivationMode activation_mode,
2526 const DeviceMemory<Eigen::half>& x_bn_data,
2527 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2528 const DeviceMemory<float>& scale_data,
2529 const DeviceMemory<float>& offset_data,
2530 const DeviceMemory<float>& saved_mean_data,
2531 const DeviceMemory<float>& saved_var_data,
2532 DeviceMemory<Eigen::half>* x_bn_backprop_data,
2533 DeviceMemory<float>* scale_backprop_data,
2534 DeviceMemory<float>* offset_backprop_data,
2535 dnn::ProfileResult* output_profile_result) {
2536 return false;
2537 }
2538
2539 protected:
2540 // Returns whether status is 'ok', and potentially logs the error.
2541 static bool IsStatusOk(const port::Status& status, bool report_error);
2542
2543 private:
2544 virtual port::Status DoPrepareForConvolution(
2545 ConvolutionKind kind, DataType element_type, Stream* stream,
2546 const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data,
2547 const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
2548 const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
2549 const ConvolutionDescriptor& convolution_descriptor,
2550 const AlgorithmConfig& algorithm_config,
2551 ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
2552 DeviceMemory<uint8>* scratch_memory) {
2553 *algorithm_desc = {};
2554 *scratch_memory = {};
2555 return port::Status::OK();
2556 }
2557
2558 SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport);
2559 };
2560
2561 } // namespace dnn
2562 } // namespace stream_executor
2563
2564 #endif // TENSORFLOW_STREAM_EXECUTOR_DNN_H_
2565