/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Output kernels for fusing computation into Eigen Tensor contractions:
//   (1) FusedConv2DOp
//   (2) FusedMatMulOp
//
// Supported fused computations:
//   (1) {Conv2D/MatMul} + BiasAdd + <Activation>
//   (2) {Conv2D/MatMul} + FusedBatchNorm + <Activation>
//
// Activation: Relu, Relu6, Elu, LeakyRelu.

#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_
#define TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"

namespace tensorflow {

enum class FusedComputationType {
  kUndefined,
  kBiasAdd,
  kBiasAddWithRelu,
  kBiasAddWithRelu6,
  kBiasAddWithElu,
  kBiasAddWithLeakyRelu,
  kFusedBatchNorm,
  kFusedBatchNormWithRelu,
  kFusedBatchNormWithRelu6,
  kFusedBatchNormWithElu,
  kFusedBatchNormWithLeakyRelu
};

// We have to pass around additional arguments for all possible fusion types.
struct FusedComputationArgs {
  float epsilon = 0.0;          // Used by `FusedBatchNorm` fusion only
  float leakyrelu_alpha = 0.0;  // Used by `LeakyRelu` fusion only
};

struct FusedComputationPattern {
  FusedComputationType fused_computation;
  std::vector<string> fused_ops;
};

// Parses attributes from the kernel construction context and verifies that
// they specify a valid fused computation pattern.
Status InitializeFusedComputation(
    OpKernelConstruction* context, const string& kernel_name,
    const std::vector<FusedComputationPattern>& patterns,
    FusedComputationType* fused_computation,
    FusedComputationArgs* fused_computation_args);
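
// Example usage (a sketch; the exact set of patterns is kernel-specific):
//
//   FusedComputationType fused_computation = FusedComputationType::kUndefined;
//   FusedComputationArgs fused_computation_args;
//   OP_REQUIRES_OK(context,
//                  InitializeFusedComputation(
//                      context, "FusedConv2D",
//                      {{FusedComputationType::kBiasAdd, {"BiasAdd"}},
//                       {FusedComputationType::kBiasAddWithRelu,
//                        {"BiasAdd", "Relu"}}},
//                      &fused_computation, &fused_computation_args));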

// Type alias for the tensor contraction output mapper.
template <typename Scalar, typename StorageIndex>
using ContractionOutputMapper =
    Eigen::internal::blas_data_mapper<Scalar, StorageIndex, Eigen::ColMajor>;
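
// With a ColMajor mapper, `&output_mapper(0, col)` points at a contiguous
// column of `num_rows` elements; the output kernels below rely on this to
// view each column as a flat tensor.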

// Returns input expression without any transformations.
struct Identity {
  template <typename XprType>
  static auto apply(XprType expr) -> XprType {
    return expr;
  }
};

// Applies `Relu` to the passed input expression.
struct Relu {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
  }
};
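// For example, applied to an expression holding {-2, 0, 3}, Relu::apply
// yields {0, 0, 3}.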

// Applies `Relu6` to the passed input expression.
struct Relu6 {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())
                      .cwiseMin(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0))
        .cwiseMin(static_cast<typename XprType::Scalar>(6));
  }
};
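// For example, applied to an expression holding {-2, 3, 9}, Relu6::apply
// yields {0, 3, 6}.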

// Applies `Elu` to the passed input expression.
struct Elu {
  template <typename XprType>
  static auto apply(XprType expr) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr.exp() -
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr.exp() -
                    expr.constant(static_cast<typename XprType::Scalar>(1)),
                expr);
  }
};
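// Elu computes `exp(x) - 1` for negative inputs and is the identity otherwise;
// e.g. elu(-1) = e^-1 - 1 ≈ -0.632, while elu(2) = 2.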

// Applies `LeakyRelu` to the passed input expression.
struct LeakyRelu {
  template <typename XprType>
  static auto apply(XprType expr, const float leakyrelu_alpha) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr *
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr * expr.constant(static_cast<typename XprType::Scalar>(
                           leakyrelu_alpha)),
                expr);
  }
};
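// LeakyRelu scales negative inputs by `leakyrelu_alpha` and is the identity
// otherwise; e.g. with alpha = 0.2, leaky_relu(-5) = -1 and leaky_relu(3) = 3.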

template <typename T>
struct BiasAddArgs {
  const T* bias_add_data = nullptr;
  float leakyrelu_alpha;

  static bool IsSupported(FusedComputationType fusion) {
    return fusion == FusedComputationType::kBiasAdd ||
           fusion == FusedComputationType::kBiasAddWithRelu ||
           fusion == FusedComputationType::kBiasAddWithRelu6 ||
           fusion == FusedComputationType::kBiasAddWithElu ||
           fusion == FusedComputationType::kBiasAddWithLeakyRelu;
  }
};

template <typename T>
struct FusedBatchNormArgs {
  const T* scale_data = nullptr;
  const T* offset_data = nullptr;
  const T* estimated_mean_data = nullptr;
  const T* estimated_variance_data = nullptr;

  // Precomputed expression:
  //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
  Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;

  float leakyrelu_alpha;

  static bool IsSupported(FusedComputationType fusion) {
    return fusion == FusedComputationType::kFusedBatchNorm ||
           fusion == FusedComputationType::kFusedBatchNormWithRelu ||
           fusion == FusedComputationType::kFusedBatchNormWithRelu6 ||
           fusion == FusedComputationType::kFusedBatchNormWithElu ||
           fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu;
  }
};
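
// In inference mode, FusedBatchNorm computes
//   y = (x - estimated_mean) * scale / sqrt(estimated_variance + epsilon)
//         + offset
//     = (x - estimated_mean) * scaling_factor + offset,
// which is why `scaling_factor` can be precomputed once and reused for all
// output blocks.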

// TensorContraction swaps lhs with rhs and changes the layout from RowMajor
// (the default in TensorFlow) to ColMajor (preferred in Eigen), and computes
// the matmul using these tensors.
//
// (1) Spatial Convolution (see eigen_spatial_convolutions.h):
//
//   The TensorContraction output matrix (before reshape) has a ColMajor
//   layout, and has dimensions:
//   - rows: output_channels
//   - cols: all other dimensions
//
//   The first element in every column is:
//     [batch ??, height ??, width ??, out_channel = i]
//
//   We do not know the values of 'batch', 'height', and 'width' here (if we
//   know the original dimensions, they can be computed from 'j').
//
//   Each column of an output block is a contiguous slice along the output
//   channel dimension, so we can use it to efficiently compute any
//   transformation that depends only on the channel value (e.g. adding the
//   channel bias).
//
// (2) Matrix Multiplication (see matmul_op.cc):
//
//   For an `MxK * KxN` matrix multiplication, the output matrix has `MxN`
//   dimensions. Each column in an output block is a slice of the innermost
//   dimension of the output matrix starting at offset 'i'.
//
//   Example: in a TensorFlow MatMul [8x32] * [32x64], each output block column
//   will correspond to a MatMul output row of size 64 (because TensorFlow uses
//   row major storage order).

// Output kernel that fuses the BiasAdd operation and the activation function
// defined by `Activation` into the output of a tensor contraction.
template <typename T, typename Activation = Identity>
struct BiasAddOutputKernel {
  explicit BiasAddOutputKernel(const BiasAddArgs<T>& args)
      : bias_data(args.bias_add_data) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* bias_base = bias_data + i;
    typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = Activation::template apply<decltype(expr)>(expr);
    }
  }

 private:
  const T* bias_data;
};

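// Specialization for `LeakyRelu`: unlike the other activations, it needs the
// runtime `leakyrelu_alpha` argument and therefore cannot be dispatched
// through the single-argument `Activation::apply(expr)` interface above.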
template <typename T>
struct BiasAddOutputKernel<T, LeakyRelu> {
  explicit BiasAddOutputKernel(const BiasAddArgs<T>& args)
      : bias_data(args.bias_add_data), leakyrelu_alpha(args.leakyrelu_alpha) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* bias_base = bias_data + i;
    typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = LeakyRelu::template apply<decltype(expr)>(expr, leakyrelu_alpha);
    }
  }

 private:
  const T* bias_data;
  float leakyrelu_alpha;
};

// Output kernel that fuses the FusedBatchNorm operation and the activation
// function defined by `Activation` into the output of a tensor contraction.
template <typename T, typename Activation = Identity>
struct FusedBatchNormOutputKernel {
  FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args)
      : epsilon(epsilon),
        scaling_factor_data(args.scaling_factor.data()),
        offset_data(args.offset_data),
        estimated_mean_data(args.estimated_mean_data) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename TTypes<T>::UnalignedConstTensor scaling_factor(scaling_factor_base,
                                                            num_rows);
    typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);

      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = Activation::template apply<decltype(shifted)>(shifted);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
};

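// Specialization for `LeakyRelu`, mirroring the BiasAdd specialization above:
// the runtime `leakyrelu_alpha` has to be threaded through to `apply`.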
template <typename T>
struct FusedBatchNormOutputKernel<T, LeakyRelu> {
  FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args)
      : epsilon(epsilon),
        scaling_factor_data(args.scaling_factor.data()),
        offset_data(args.offset_data),
        estimated_mean_data(args.estimated_mean_data),
        leakyrelu_alpha(args.leakyrelu_alpha) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename TTypes<T>::UnalignedConstTensor scaling_factor(scaling_factor_base,
                                                            num_rows);
    typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);

      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = LeakyRelu::template apply<decltype(shifted)>(shifted,
                                                            leakyrelu_alpha);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
  float leakyrelu_alpha;
};

// Type aliases for the output kernels, purely to make the launch dispatching
// code more readable.
template <typename T>
using WithBiasAdd = BiasAddOutputKernel<T>;
template <typename T>
using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
template <typename T>
using WithBiasAddAndRelu6 = BiasAddOutputKernel<T, Relu6>;
template <typename T>
using WithBiasAddAndElu = BiasAddOutputKernel<T, Elu>;
template <typename T>
using WithBiasAddAndLeakyRelu = BiasAddOutputKernel<T, LeakyRelu>;
template <typename T>
using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
template <typename T>
using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
template <typename T>
using WithFusedBatchNormAndRelu6 = FusedBatchNormOutputKernel<T, Relu6>;
template <typename T>
using WithFusedBatchNormAndElu = FusedBatchNormOutputKernel<T, Elu>;
template <typename T>
using WithFusedBatchNormAndLeakyRelu = FusedBatchNormOutputKernel<T, LeakyRelu>;

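// An output kernel is passed as the last argument to Eigen's Tensor
// `contract`, so the fused bias/activation is applied to each output block
// while it is still in cache. A minimal sketch (assuming `lhs`, `rhs`, `out`,
// `contract_dims`, `device`, and a populated `args` are in scope):
//
//   out.device(device) =
//       lhs.contract(rhs, contract_dims, WithBiasAddAndRelu<float>(args));
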
template <typename T>
Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs<T>* args,
                       const float* leakyrelu_alpha = nullptr) {
  // Bias of the following dimensions: [ output_depth ]
  const Tensor& bias = context->input(2);

  if (bias.dims() != 1)
    return errors::InvalidArgument("bias must be 1-dimensional: ",
                                   bias.shape().DebugString());

  const auto data_ptr = [](const Tensor& tensor) -> const T* {
    return reinterpret_cast<const T*>(tensor.tensor_data().data());
  };

  args->bias_add_data = data_ptr(bias);

  if (leakyrelu_alpha) {
    args->leakyrelu_alpha = *leakyrelu_alpha;
  }

  return Status::OK();
}

template <typename T>
Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon,
                              FusedBatchNormArgs<T>* args,
                              const float* leakyrelu_alpha = nullptr) {
  const Tensor& scale = context->input(2);
  const Tensor& offset = context->input(3);
  const Tensor& estimated_mean = context->input(4);
  const Tensor& estimated_variance = context->input(5);

  if (scale.dims() != 1)
    return errors::InvalidArgument("scale must be 1-dimensional: ",
                                   scale.shape().DebugString());
  if (offset.dims() != 1)
    return errors::InvalidArgument("offset must be 1-dimensional: ",
                                   offset.shape().DebugString());
  if (estimated_mean.dims() != 1)
    return errors::InvalidArgument("estimated_mean must be 1-dimensional: ",
                                   estimated_mean.shape().DebugString());
  if (estimated_variance.dims() != 1)
    return errors::InvalidArgument(
        "estimated_variance must be 1-dimensional: ",
        estimated_variance.shape().DebugString());

  const auto data_ptr = [](const Tensor& tensor) -> const T* {
    return reinterpret_cast<const T*>(tensor.tensor_data().data());
  };

  args->scale_data = data_ptr(scale);
  args->offset_data = data_ptr(offset);
  args->estimated_mean_data = data_ptr(estimated_mean);
  args->estimated_variance_data = data_ptr(estimated_variance);

  // Precompute scaling factor once for all output blocks (kernels).
  args->scaling_factor =
      (estimated_variance.flat<T>() + static_cast<T>(epsilon)).rsqrt() *
      scale.flat<T>();

  if (leakyrelu_alpha) {
    args->leakyrelu_alpha = *leakyrelu_alpha;
  }

  return Status::OK();
}

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_