• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <algorithm>
17 #include <cmath>
18 #include <type_traits>
19 
20 #define GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK
21 #include "public/gemmlowp.h"
22 #include "tensorflow/core/framework/bounds_check.h"
23 #include "tensorflow/core/framework/kernel_shape_util.h"
24 #include "tensorflow/core/framework/numeric_op.h"
25 #include "tensorflow/core/framework/op_kernel.h"
26 #include "tensorflow/core/framework/register_types.h"
27 #include "tensorflow/core/framework/tensor.h"
28 #include "tensorflow/core/framework/tensor_shape.h"
29 #include "tensorflow/core/framework/tensor_types.h"
30 #include "tensorflow/core/framework/types.h"
31 #include "tensorflow/core/kernels/neon/depthwiseconv_float.h"
32 #include "tensorflow/core/lib/core/status.h"
33 #include "tensorflow/core/platform/logging.h"
34 #include "tensorflow/core/platform/mem.h"
35 #include "tensorflow/core/platform/types.h"
36 #include "tensorflow/core/util/padding.h"
37 
38 namespace tensorflow {
39 
40 // A version of tensorflow/core/kernels/depthwise_conv_op.cc that
41 // uses the neon intrinsics.
42 class NeonDepthwiseConv2dNativeOp : public BinaryOp<float> {
43  public:
NeonDepthwiseConv2dNativeOp(OpKernelConstruction * context)44   explicit NeonDepthwiseConv2dNativeOp(OpKernelConstruction* context)
45       : BinaryOp<float>(context) {
46     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
47     OP_REQUIRES(context, strides_.size() == 4,
48                 errors::InvalidArgument("Sliding window strides field must "
49                                         "specify 4 dimensions"));
50     OP_REQUIRES(context, strides_[1] == strides_[2],
51                 errors::InvalidArgument(
52                     "Current implementation only supports equal length "
53                     "strides in the row and column dimensions."));
54     OP_REQUIRES(
55         context, (strides_[0] == 1 && strides_[3] == 1),
56         errors::InvalidArgument("Current implementation does not yet support "
57                                 "strides in the batch and depth dimensions."));
58     OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
59   }
60 
Compute(OpKernelContext * context)61   void Compute(OpKernelContext* context) override {
62     const Tensor& input = context->input(0);
63     const Tensor& filter = context->input(1);
64 
65     // For 2D convolution, there should be 4 dimensions.
66     OP_REQUIRES(context, input.dims() == 4,
67                 errors::InvalidArgument("input must be 4-dimensional",
68                                         input.shape().DebugString()));
69     OP_REQUIRES(context, filter.dims() == 4,
70                 errors::InvalidArgument("filter must be 4-dimensional: ",
71                                         filter.shape().DebugString()));
72 
73     const int32 in_depth = input.dim_size(3);
74     OP_REQUIRES(context, in_depth == filter.dim_size(2),
75                 errors::InvalidArgument(
76                     "input and filter must have the same depth: ", in_depth,
77                     " vs ", filter.dim_size(2)));
78     const int32 batch = input.dim_size(0);
79     const int32 input_rows = input.dim_size(1);
80     const int32 input_cols = input.dim_size(2);
81 
82     const int32 filter_rows = filter.dim_size(0);
83     const int32 filter_cols = filter.dim_size(1);
84     const int32 depth_multiplier = filter.dim_size(3);
85 
86     const int32 out_depth = in_depth * depth_multiplier;
87 
88     const int32 stride = strides_[1];
89 
90     int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
91     OP_REQUIRES_OK(context,
92                    GetWindowedOutputSize(input_rows, filter_rows, stride,
93                                          padding_, &out_rows, &pad_rows));
94     OP_REQUIRES_OK(context,
95                    GetWindowedOutputSize(input_cols, filter_cols, stride,
96                                          padding_, &out_cols, &pad_cols));
97     TensorShape out_shape({batch, out_rows, out_cols, out_depth});
98     OP_REQUIRES(
99         context,
100         FastBoundsCheck(out_shape.num_elements(),
101                         std::numeric_limits<int32>::max()),
102         errors::InvalidArgument("Output elements too large for NEON kernel"));
103 
104     // Output tensor is of the following dimensions:
105     // [ in_batch, out_rows, out_cols, out_depth ]
106     Tensor* output = nullptr;
107     OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
108 
109     VLOG(2) << "NeonDepthwiseConv2dNative: "
110             << " Input: [" << batch << ", " << input_rows << ", " << input_cols
111             << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
112             << filter_cols << ", " << in_depth << ", " << depth_multiplier
113             << "]; stride = " << stride << ", pad_rows = " << pad_rows
114             << ", pad_cols = " << pad_cols << ", output: [" << batch << ", "
115             << out_rows << ", " << out_cols << ", " << out_depth << "]";
116 
117     // If there is nothing to compute, return.
118     if (out_shape.num_elements() == 0) {
119       return;
120     }
121 
122     const float* input_ptr = input.template flat<float>().data();
123     const float* filter_ptr = filter.template flat<float>().data();
124     float* output_ptr = output->template flat<float>().data();
125 
126     auto input_neon_dims = ToNeonDims(input.shape());
127     auto filter_neon_dims = FilterToNeonDims(filter.shape());
128     auto bias_neon_dims = BiasNeonDims(filter.shape());
129 
130     int64 bias_size = bias_neon_dims.sizes[0];
131     float* bias_ptr = static_cast<float*>(port::AlignedMalloc(
132         bias_size * sizeof(float), Allocator::kAllocatorAlignment));
133     memset(bias_ptr, 0, bias_size * sizeof(float));
134 
135     neon::DepthwiseConv<neon::FusedActivationFunctionType::kNone>(
136         input_ptr, input_neon_dims, filter_ptr, filter_neon_dims, bias_ptr,
137         bias_neon_dims, stride, pad_cols, pad_rows, depth_multiplier,
138         output_ptr, ToNeonDims(out_shape));
139 
140     port::AlignedFree(bias_ptr);
141   }
142 
143  private:
SetNeonDimStrides(neon::Dims<4> * d)144   void SetNeonDimStrides(neon::Dims<4>* d) {
145     int64 stride = 1;
146     for (int i = 0; i < 4; ++i) {
147       d->strides[i] = stride;
148       stride *= d->sizes[i];
149     }
150   }
151 
ToNeonDims(const TensorShape & input)152   neon::Dims<4> ToNeonDims(const TensorShape& input) {
153     // Dims in the neon kernels are channel, x, y, batch order.
154     neon::Dims<4> result;
155     result.sizes[0] = input.dim_size(3);
156     result.sizes[1] = input.dim_size(2);
157     result.sizes[2] = input.dim_size(1);
158     result.sizes[3] = input.dim_size(0);
159     SetNeonDimStrides(&result);
160     return result;
161   }
162 
FilterToNeonDims(const TensorShape & filter)163   neon::Dims<4> FilterToNeonDims(const TensorShape& filter) {
164     // Dims in the neon kernels are channel, x, y, batch order.
165     neon::Dims<4> result;
166     result.sizes[0] = filter.dim_size(2) * filter.dim_size(3);
167     result.sizes[1] = filter.dim_size(1);
168     result.sizes[2] = filter.dim_size(0);
169     result.sizes[3] = 1;
170     SetNeonDimStrides(&result);
171 
172     return result;
173   }
174 
BiasNeonDims(const TensorShape & filter)175   neon::Dims<4> BiasNeonDims(const TensorShape& filter) {
176     // Dims in the neon kernels are channel, x, y, batch order.
177     // Bias has only output channel set.
178     neon::Dims<4> result;
179     result.sizes[0] =
180         filter.dim_size(2) * filter.dim_size(3);  // output channels
181     result.sizes[1] = 1;
182     result.sizes[2] = 1;
183     result.sizes[3] = 1;
184     SetNeonDimStrides(&result);
185 
186     return result;
187   }
188 
189   std::vector<int32> strides_;
190   Padding padding_;
191 
192   TF_DISALLOW_COPY_AND_ASSIGN(NeonDepthwiseConv2dNativeOp);
193 };
194 
195 #define REGISTER_CPU_KERNEL(T)                            \
196   REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")   \
197                               .Device(DEVICE_CPU)         \
198                               .TypeConstraint<float>("T") \
199                               .Label("neon"),             \
200                           NeonDepthwiseConv2dNativeOp);
201 
202 TF_CALL_float(REGISTER_CPU_KERNEL);
203 
204 }  // namespace tensorflow
205