/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <algorithm>
#include <cmath>
#include <type_traits>

#define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
#include "public/gemmlowp.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"

namespace tensorflow {

// A version of tensorflow/core/kernels/depthwise_conv_op.cc that
// uses the neon intrinsics.
class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
 public:
  explicit NeonDepthwiseConv2dNativeOp(OpKernelConstruction* context)
      : BinaryOp<float>(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    OP_REQUIRES(context, strides_[1] == strides_[2],
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (strides_[0] == 1 && strides_[3] == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);
    const Tensor& filter = context->input(1);

    // For 2D convolution, there should be 4 dimensions.
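    // Input is laid out NHWC as [batch, rows, cols, in_depth]; the filter is
    // [filter_rows, filter_cols, in_depth, depth_multiplier].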
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional: ",
                                        input.shape().DebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().DebugString()));

    const int32 in_depth = input.dim_size(3);
    OP_REQUIRES(context, in_depth == filter.dim_size(2),
                errors::InvalidArgument(
                    "input and filter must have the same depth: ", in_depth,
                    " vs ", filter.dim_size(2)));
    const int32 batch = input.dim_size(0);
    const int32 input_rows = input.dim_size(1);
    const int32 input_cols = input.dim_size(2);

    const int32 filter_rows = filter.dim_size(0);
    const int32 filter_cols = filter.dim_size(1);
    const int32 depth_multiplier = filter.dim_size(3);

    const int32 out_depth = in_depth * depth_multiplier;

    const int32 stride = strides_[1];

    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(input_rows, filter_rows, stride,
                                         padding_, &out_rows, &pad_rows));
    OP_REQUIRES_OK(context,
                   GetWindowedOutputSize(input_cols, filter_cols, stride,
                                         padding_, &out_cols, &pad_cols));
    TensorShape out_shape({batch, out_rows, out_cols, out_depth});
    OP_REQUIRES(
        context,
        FastBoundsCheck(out_shape.num_elements(),
                        std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Output elements too large for NEON kernel"));

    // Output tensor is of the following dimensions:
    // [ in_batch, out_rows, out_cols, out_depth ]
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    VLOG(2) << "NeonDepthwiseConv2dNative: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; stride = " << stride << ", pad_rows = " << pad_rows
            << ", pad_cols = " << pad_cols << ", output: [" << batch << ", "
            << out_rows << ", " << out_cols << ", " << out_depth << "]";

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

    const float* input_ptr = input.template flat<float>().data();
    const float* filter_ptr = filter.template flat<float>().data();
    float* output_ptr = output->template flat<float>().data();

    auto input_neon_dims = ToNeonDims(input.shape());
    auto filter_neon_dims = FilterToNeonDims(filter.shape());
    auto bias_neon_dims = BiasNeonDims(filter.shape());

    // The neon kernel expects a bias vector; this op has no bias input, so
    // allocate a zero-filled one of out_depth elements.
    int64 bias_size = bias_neon_dims.sizes[0];
    float* bias_ptr = static_cast<float*>(port::AlignedMalloc(
        bias_size * sizeof(float), Allocator::kAllocatorAlignment));
    memset(bias_ptr, 0, bias_size * sizeof(float));

    neon::DepthwiseConv<neon::FusedActivationFunctionType::kNone>(
        input_ptr, input_neon_dims, filter_ptr, filter_neon_dims, bias_ptr,
        bias_neon_dims, stride, pad_cols, pad_rows, depth_multiplier,
        output_ptr, ToNeonDims(out_shape));

    port::AlignedFree(bias_ptr);
  }

 private:
  // Fills in row-major strides for the given sizes: dimension 0 is the
  // innermost (stride 1), each following dimension strides over the previous.
  void SetNeonDimStrides(neon::Dims<4>* d) {
    int64 stride = 1;
    for (int i = 0; i < 4; ++i) {
      d->strides[i] = stride;
      stride *= d->sizes[i];
    }
  }

  neon::Dims<4> ToNeonDims(const TensorShape& input) {
    // Dims in the neon kernels are channel, x, y, batch order.
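    // TensorFlow shapes are [batch, y, x, channel] (NHWC), so the mapping
    // simply reverses the dimension order.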
    neon::Dims<4> result;
    result.sizes[0] = input.dim_size(3);
    result.sizes[1] = input.dim_size(2);
    result.sizes[2] = input.dim_size(1);
    result.sizes[3] = input.dim_size(0);
    SetNeonDimStrides(&result);
    return result;
  }

  neon::Dims<4> FilterToNeonDims(const TensorShape& filter) {
    // Dims in the neon kernels are channel, x, y, batch order.
    neon::Dims<4> result;
    result.sizes[0] = filter.dim_size(2) * filter.dim_size(3);
    result.sizes[1] = filter.dim_size(1);
    result.sizes[2] = filter.dim_size(0);
    result.sizes[3] = 1;
    SetNeonDimStrides(&result);

    return result;
  }

  neon::Dims<4> BiasNeonDims(const TensorShape& filter) {
    // Dims in the neon kernels are channel, x, y, batch order.
    // Bias has only output channel set.
    neon::Dims<4> result;
    result.sizes[0] =
        filter.dim_size(2) * filter.dim_size(3);  // output channels
    result.sizes[1] = 1;
    result.sizes[2] = 1;
    result.sizes[3] = 1;
    SetNeonDimStrides(&result);

    return result;
  }

  std::vector<int32> strides_;
  Padding padding_;

  TF_DISALLOW_COPY_AND_ASSIGN(NeonDepthwiseConv2dNativeOp);
};

#define REGISTER_CPU_KERNEL(T)                            \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")   \
                              .Device(DEVICE_CPU)         \
                              .TypeConstraint<float>("T") \
                              .Label("neon"),             \
                          NeonDepthwiseConv2dNativeOp);

TF_CALL_float(REGISTER_CPU_KERNEL);

}  // namespace tensorflow