/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Output kernels for fusing computation into Eigen Tensor contractions:
//   (1) FusedConv2DOp
//   (2) FusedMatMulOp
//
// Supported fused computations:
//   (1) {Conv2D/MatMul} + BiasAdd + <Activation>
//   (2) {Conv2D/MatMul} + FusedBatchNorm + <Activation>
//
// Activation: Relu, Relu6, Elu, etc...

#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_
#define TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"

namespace tensorflow {

enum class FusedComputationType {
  kUndefined,
  kBiasAdd,
  kBiasAddWithRelu,
  kBiasAddWithRelu6,
  kBiasAddWithElu,
  kBiasAddWithLeakyRelu,
  kFusedBatchNorm,
  kFusedBatchNormWithRelu,
  kFusedBatchNormWithRelu6,
  kFusedBatchNormWithElu,
  kFusedBatchNormWithLeakyRelu
};

// We have to pass around additional arguments for all possible fusion types.
struct FusedComputationArgs {
  float epsilon = 0.0;          // Used by `FusedBatchNorm` fusion only
  float leakyrelu_alpha = 0.0;  // Used by `LeakyRelu` fusion only
};

struct FusedComputationPattern {
  FusedComputationType fused_computation;
  std::vector<string> fused_ops;
};

// Parses attributes from the kernel construction context and verifies that
// they specify a valid fused computation pattern.
Status InitializeFusedComputation(
    OpKernelConstruction* context, const string& kernel_name,
    const std::vector<FusedComputationPattern>& patterns,
    FusedComputationType* fused_computation,
    FusedComputationArgs* fused_computation_args);

// Type alias for the tensor contraction output mapper.
template <typename Scalar, typename StorageIndex>
using ContractionOutputMapper =
    Eigen::internal::blas_data_mapper<Scalar, StorageIndex, Eigen::ColMajor>;

// Returns the input expression without any transformations.
struct Identity {
  template <typename XprType>
  static auto apply(XprType expr) -> XprType {
    return expr;
  };
};

// Applies `Relu` to the passed input expression.
struct Relu {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
  };
};
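// A minimal sketch of how these functors compose with Eigen tensor
// expressions (hypothetical standalone snippet, not part of this header;
// `t` is an assumed example tensor):
//
//   Eigen::Tensor<float, 1> t(3);
//   t.setValues({-1.0f, 0.0f, 2.0f});
//   // `apply` accepts any tensor expression and returns a lazy expression;
//   // assigning the result to a Tensor forces evaluation.
//   Eigen::Tensor<float, 1> r = Relu::apply(t + t.constant(0.5f));
//   // r == [0.0f, 0.5f, 2.5f]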
// Applies `Relu6` to the passed input expression.
struct Relu6 {
  template <typename XprType>
  static auto apply(XprType expr)
      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())
                      .cwiseMin(std::declval<typename XprType::Scalar>())) {
    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0))
        .cwiseMin(static_cast<typename XprType::Scalar>(6));
  };
};

// Applies `Elu` to the passed input expression.
struct Elu {
  template <typename XprType>
  static auto apply(XprType expr) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr.exp() -
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr.exp() -
                    expr.constant(static_cast<typename XprType::Scalar>(1)),
                expr);
  };
};

// Applies `LeakyRelu` to the passed input expression.
struct LeakyRelu {
  template <typename XprType>
  static auto apply(XprType expr, const float leakyrelu_alpha) -> decltype(
      (expr < std::declval<typename XprType::Scalar>())
          .select(expr *
                      expr.constant(std::declval<typename XprType::Scalar>()),
                  expr)) {
    return (expr < static_cast<typename XprType::Scalar>(0))
        .select(expr * expr.constant(static_cast<typename XprType::Scalar>(
                           leakyrelu_alpha)),
                expr);
  };
};

template <typename T>
struct BiasAddArgs {
  const T* bias_add_data = nullptr;
  float leakyrelu_alpha;

  static bool IsSupported(FusedComputationType fusion) {
    return fusion == FusedComputationType::kBiasAdd ||
           fusion == FusedComputationType::kBiasAddWithRelu ||
           fusion == FusedComputationType::kBiasAddWithRelu6 ||
           fusion == FusedComputationType::kBiasAddWithElu ||
           fusion == FusedComputationType::kBiasAddWithLeakyRelu;
  }
};

template <typename T>
struct FusedBatchNormArgs {
  const T* scale_data = nullptr;
  const T* offset_data = nullptr;
  const T* estimated_mean_data = nullptr;
  const T* estimated_variance_data = nullptr;

  // Precomputed expression:
  //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
  Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;

  float leakyrelu_alpha;

  static bool IsSupported(FusedComputationType fusion) {
    return fusion == FusedComputationType::kFusedBatchNorm ||
           fusion == FusedComputationType::kFusedBatchNormWithRelu ||
           fusion == FusedComputationType::kFusedBatchNormWithRelu6 ||
           fusion == FusedComputationType::kFusedBatchNormWithElu ||
           fusion == FusedComputationType::kFusedBatchNormWithLeakyRelu;
  }
};
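// Why the precomputation above is valid: in inference mode FusedBatchNorm
// reduces to a per-channel affine transform,
//
//   output = (input - estimated_mean) * scaling_factor + offset
//   scaling_factor = scale / sqrt(estimated_variance + epsilon)
//
// A small numeric sketch with hypothetical values: scale = 2, epsilon = 0,
// and estimated_variance = 4 give scaling_factor = 2 / sqrt(4) = 1; with
// estimated_mean = 3 and offset = 5, an input x maps to (x - 3) * 1 + 5.
// Computing `scaling_factor` once avoids an rsqrt per output block.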
// TensorContraction swaps lhs with rhs, changes the layout from RowMajor
// (default in TensorFlow) to ColMajor (preferred in Eigen), and computes the
// matmul using these tensors.
//
// (1) Spatial Convolution (see eigen_spatial_convolutions.h):
//
// TensorContraction output matrix (before reshape) has a ColMajor layout, and
// has dimensions:
//   - rows: output_channels
//   - cols: all other dimensions
//
// First element in every column is:
//   [batch ??, height ??, width ??, out_channel = i]
//
// We do not know the values of 'batch', 'height', and 'width' here (if we
// knew the original dimensions, they could be computed from 'j').
//
// Each column of an output block is a contiguous slice along the output
// channel dimension, so we can use it to efficiently compute any
// transformation that depends only on a channel value (e.g. add a channel
// bias).
//
// (2) Matrix Multiplication (see matmul_op.cc):
//
// For the `MxK * KxN` matrix multiplication, the output matrix has `MxN`
// dimensions. Each column in an output block is a slice of the innermost
// dimension of the output matrix starting at offset 'i'.
//
// Example: In TensorFlow MatMul [8x32] * [32x64], each output block column
// will correspond to a MatMul output row of size 64 (because TensorFlow uses
// row major storage order).

// Output kernel that fuses the BiasAdd operation into the output of a tensor
// contraction + the activation function defined by Activation.
template <typename T, typename Activation = Identity>
struct BiasAddOutputKernel {
  explicit BiasAddOutputKernel(const BiasAddArgs<T>& args)
      : bias_data(args.bias_add_data) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* bias_base = bias_data + i;
    typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = Activation::template apply<decltype(expr)>(expr);
    }
  }

 private:
  const T* bias_data;
};

template <typename T>
struct BiasAddOutputKernel<T, LeakyRelu> {
  explicit BiasAddOutputKernel(const BiasAddArgs<T>& args)
      : bias_data(args.bias_add_data), leakyrelu_alpha(args.leakyrelu_alpha) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* bias_base = bias_data + i;
    typename TTypes<T>::UnalignedConstTensor bias(bias_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);
      const auto expr = output + bias;
      output = LeakyRelu::template apply<decltype(expr)>(expr, leakyrelu_alpha);
    }
  }

 private:
  const T* bias_data;
  float leakyrelu_alpha;
};
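// A hedged usage sketch: output kernels are passed as the last argument to
// Eigen's `contract`, which invokes them on every finished output block
// (hypothetical snippet; `out`, `lhs`, `rhs`, `dim_pair`, `d`, and `args`
// are assumed to be set up by the caller):
//
//   BiasAddArgs<float> args;
//   // ... populate args.bias_add_data from the bias tensor ...
//   out.device(d) =
//       lhs.contract(rhs, dim_pair, BiasAddOutputKernel<float, Relu>(args));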
// Output kernel that fuses the FusedBatchNorm operation into the output of a
// tensor contraction + the activation function defined by Activation.
template <typename T, typename Activation = Identity>
struct FusedBatchNormOutputKernel {
  FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args)
      : epsilon(epsilon),
        scaling_factor_data(args.scaling_factor.data()),
        offset_data(args.offset_data),
        estimated_mean_data(args.estimated_mean_data) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename TTypes<T>::UnalignedConstTensor scaling_factor(
        scaling_factor_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);

      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = Activation::template apply<decltype(shifted)>(shifted);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
};

template <typename T>
struct FusedBatchNormOutputKernel<T, LeakyRelu> {
  FusedBatchNormOutputKernel(T epsilon, const FusedBatchNormArgs<T>& args)
      : epsilon(epsilon),
        scaling_factor_data(args.scaling_factor.data()),
        offset_data(args.offset_data),
        estimated_mean_data(args.estimated_mean_data),
        leakyrelu_alpha(args.leakyrelu_alpha) {}

  template <typename StorageIndex, typename Scalar>
  EIGEN_ALWAYS_INLINE void operator()(
      const ContractionOutputMapper<Scalar, StorageIndex>& output_mapper,
      const Eigen::TensorContractionParams& params, StorageIndex i,
      StorageIndex j, StorageIndex num_rows, StorageIndex num_cols) const {
    DCHECK(params.swapped_arguments);

    const T* scaling_factor_base = scaling_factor_data + i;
    const T* offset_base = offset_data + i;
    const T* mean_base = estimated_mean_data + i;

    typename TTypes<T>::UnalignedConstTensor scaling_factor(
        scaling_factor_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor offset(offset_base, num_rows);
    typename TTypes<T>::UnalignedConstTensor mean(mean_base, num_rows);

    for (int col = 0; col < num_cols; ++col) {
      T* output_base = &output_mapper(0, col);
      typename TTypes<T>::UnalignedTensor output(output_base, num_rows);

      auto scaled = (output - mean) * scaling_factor;
      auto shifted = scaled + offset;

      output = LeakyRelu::template apply<decltype(shifted)>(shifted,
                                                            leakyrelu_alpha);
    }
  }

 private:
  T epsilon;
  const T* scaling_factor_data;
  const T* offset_data;
  const T* estimated_mean_data;
  float leakyrelu_alpha;
};

// Type aliases for the output kernels, purely for the sake of better launch
// dispatching code readability.
template <typename T>
using WithBiasAdd = BiasAddOutputKernel<T>;
template <typename T>
using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
template <typename T>
using WithBiasAddAndRelu6 = BiasAddOutputKernel<T, Relu6>;
template <typename T>
using WithBiasAddAndElu = BiasAddOutputKernel<T, Elu>;
template <typename T>
using WithBiasAddAndLeakyRelu = BiasAddOutputKernel<T, LeakyRelu>;
template <typename T>
using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
template <typename T>
using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
template <typename T>
using WithFusedBatchNormAndRelu6 = FusedBatchNormOutputKernel<T, Relu6>;
template <typename T>
using WithFusedBatchNormAndElu = FusedBatchNormOutputKernel<T, Elu>;
template <typename T>
using WithFusedBatchNormAndLeakyRelu = FusedBatchNormOutputKernel<T, LeakyRelu>;
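// A hedged sketch of the launch dispatching these aliases are meant for
// (hypothetical caller-side code; `Launch` is an assumed helper, and the
// real dispatch lives in the fused Conv2D/MatMul kernels):
//
//   // Patterns registered via InitializeFusedComputation, e.g.:
//   //   {FusedComputationType::kBiasAdd, {"BiasAdd"}}
//   //   {FusedComputationType::kBiasAddWithRelu, {"BiasAdd", "Relu"}}
//
//   switch (fused_computation) {
//     case FusedComputationType::kBiasAddWithRelu: {
//       BiasAddArgs<float> args;
//       TF_RETURN_IF_ERROR(InitBiasAddArgs(context, &args));
//       return Launch(context, WithBiasAddAndRelu<float>(args));
//     }
//     // ... one case per FusedComputationType ...
//     default:
//       return errors::Internal("Fusion type is not supported");
//   }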
template <typename T>
Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs<T>* args,
                       const float* leakyrelu_alpha = nullptr) {
  // Bias of the following dimensions: [ output_depth ]
  const Tensor& bias = context->input(2);

  if (bias.dims() != 1)
    return errors::InvalidArgument("bias must be 1-dimensional",
                                   bias.shape().DebugString());

  const auto data_ptr = [](const Tensor& tensor) -> const T* {
    return reinterpret_cast<const T*>(tensor.tensor_data().data());
  };

  args->bias_add_data = data_ptr(bias);

  if (leakyrelu_alpha) {
    args->leakyrelu_alpha = *leakyrelu_alpha;
  }

  return Status::OK();
}

template <typename T>
Status InitFusedBatchNormArgs(OpKernelContext* context, float epsilon,
                              FusedBatchNormArgs<T>* args,
                              const float* leakyrelu_alpha = nullptr) {
  const Tensor& scale = context->input(2);
  const Tensor& offset = context->input(3);
  const Tensor& estimated_mean = context->input(4);
  const Tensor& estimated_variance = context->input(5);

  if (scale.dims() != 1)
    return errors::InvalidArgument("scale must be 1-dimensional",
                                   scale.shape().DebugString());
  if (offset.dims() != 1)
    return errors::InvalidArgument("offset must be 1-dimensional",
                                   offset.shape().DebugString());
  if (estimated_mean.dims() != 1)
    return errors::InvalidArgument("estimated_mean must be 1-dimensional",
                                   estimated_mean.shape().DebugString());
  if (estimated_variance.dims() != 1)
    return errors::InvalidArgument("estimated_variance must be 1-dimensional",
                                   estimated_variance.shape().DebugString());

  const auto data_ptr = [](const Tensor& tensor) -> const T* {
    return reinterpret_cast<const T*>(tensor.tensor_data().data());
  };

  args->scale_data = data_ptr(scale);
  args->offset_data = data_ptr(offset);
  args->estimated_mean_data = data_ptr(estimated_mean);
  args->estimated_variance_data = data_ptr(estimated_variance);

  // Precompute scaling factor once for all output blocks (kernels).
  args->scaling_factor =
      (estimated_variance.flat<T>() + static_cast<T>(epsilon)).rsqrt() *
      scale.flat<T>();

  if (leakyrelu_alpha) {
    args->leakyrelu_alpha = *leakyrelu_alpha;
  }

  return Status::OK();
}

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_FUSED_EIGEN_OUTPUT_KERNELS_H_