/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
#include "src/core/NEON/wrapper/wrapper.h"

#include <map>

namespace arm_compute
{
namespace
{
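// Checks the kernel's arguments: supported data types, matching shapes/layouts and,
// when a fused activation is requested, that it is one of the supported ReLU variants.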
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
                          const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
{
    ARM_COMPUTE_UNUSED(epsilon);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);

    if(act_info.enabled())
    {
        ActivationLayerInfo::ActivationFunction act = act_info.activation();
        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationFunction::RELU
                                    && act != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
                                    && act != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
        ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
    }

    if(nullptr != output)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
    }

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
    if(beta != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
    }
    if(gamma != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
    }
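    // mean/var/gamma/beta are per-channel 1D tensors, so their length must match
    // the input's channel dimension in either data layout.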
    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *var, ITensorInfo *gamma, ITensorInfo *beta)
{
    ARM_COMPUTE_UNUSED(mean, var, gamma, beta);

    // Configure kernel window
    Window win = calculate_max_window(*input, Steps());

    if(output != nullptr)
    {
        // Output auto initialization if not yet initialized
        auto_init_if_empty(*output, *input->clone());

        // NEBatchNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped
        Coordinates coord;
        coord.set_num_dimensions(output->num_dimensions());
        output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
    }

    return std::make_pair(Status{}, win);
}
} // namespace

template <typename T, bool fused_activation, typename F>
void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
{
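    // Computes out = gamma * (x - mean) / sqrt(var + epsilon) + beta. In NCHW each
    // z-slice of the window is a single feature map, so the per-channel parameters
    // (and the denominator derived from them) only change when the slice changes.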
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    const int  window_step_x  = 16 / sizeof(T);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

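    // Restrict the execution window to a single step along x; the x dimension is
    // traversed manually inside the loop so it can be vectorized.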
    Window win_to_use = window;
    win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win_to_use);
    Iterator output(_output, win_to_use);

    F activation_functor(_act_info);

    // Hold information about the current feature map we are iterating.
    // Only compute denominator and NEON vectors once per feature map.
    int slice = -1;

    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;

    T mean        = static_cast<T>(0);
    T var         = static_cast<T>(0);
    T gamma       = static_cast<T>(1);
    T beta        = static_cast<T>(0);
    T denominator = static_cast<T>(0);

    auto       mean_vec        = wrapper::vdup_n(mean, ExactTagType{});
    auto       var_vec         = wrapper::vdup_n(var, ExactTagType{});
    auto       gamma_vec       = wrapper::vdup_n(gamma, ExactTagType{});
    auto       beta_vec        = wrapper::vdup_n(beta, ExactTagType{});
    auto       denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
    const auto epsilon_vec     = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
    execute_window_loop(win_to_use, [&](const Coordinates & id)
    {
        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
        const auto output_ptr = reinterpret_cast<T *>(output.ptr());

        if(slice != id.z())
        {
            mean     = input_mean[id.z()];
            var      = input_var[id.z()];
            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
            var_vec  = wrapper::vdup_n(var, ExactTagType{});
            if(input_gamma != nullptr)
            {
                gamma     = input_gamma[id.z()];
                gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
            }
            if(input_beta != nullptr)
            {
                beta     = input_beta[id.z()];
                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
            }

            // Calculate denominator
            denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
            denominator     = wrapper::vgetlane(denominator_vec, 0);
            slice           = id.z();
        }

        // Perform core calculations using vector operations
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Calculate x bar
            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
            const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
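            // res = beta + gamma * x_bar, computed as a fused multiply-accumulate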
            auto res = wrapper::vmla(beta_vec, x_bar, gamma_vec);

            // Perform fused activation
            if(fused_activation)
            {
                activation_functor(res);
            }

            // Store results
            wrapper::vstore(output_ptr + x, res);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const T numerator = input_ptr[x] - mean;
            const T x_bar     = numerator * denominator;
            T       res       = beta + x_bar * gamma;

            // Perform fused activation
            if(fused_activation)
            {
                activation_functor(res);
            }

            // Store results
            *(output_ptr + x) = res;
        }
    },
    input, output);
}

template <typename T, bool fused_activation, typename F>
void NEBatchNormalizationLayerKernel::batch_normalization_nhwc(const Window &window)
{
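    // In NHWC the channel dimension is innermost, so mean/var/gamma/beta can be
    // loaded as vectors alongside the input instead of being broadcast per slice.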
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    const int  window_step_x  = 16 / sizeof(T);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

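    // Collapse the higher window dimensions where possible; x (the channel
    // dimension in NHWC) is traversed manually below so it can be vectorized.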
    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win_collapsed);
    Iterator output(_output, win_collapsed);

    F activation_functor(_act_info);

    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;

    const auto epsilon_vec = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
    execute_window_loop(win_collapsed, [&](const Coordinates &)
    {
        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
        const auto output_ptr = reinterpret_cast<T *>(output.ptr());

        // Perform core calculations using vector operations
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Construct vectors
            const auto mean_vec  = wrapper::vloadq(input_mean + x);
            const auto var_vec   = wrapper::vloadq(input_var + x);
            const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
            const auto beta_vec  = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});

            // Calculate denominator
            const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));

            // Calculate x bar
            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
            const auto x_bar     = wrapper::vmul(numerator, denominator);
            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);

            // Perform fused activation
            if(fused_activation)
            {
                activation_functor(res);
            }

            // Store results
            wrapper::vstore(output_ptr + x, res);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Construct scalars
            const T gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
            const T beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;

            const T denominator = sqrt(input_var[x] + _epsilon);
            const T numerator   = input_ptr[x] - input_mean[x];
            const T x_bar       = numerator / denominator;
            T       res         = beta + x_bar * gamma;

            // Perform fused activation
            if(fused_activation)
            {
                activation_functor(res);
            }

            // Store results
            *(output_ptr + x) = res;
        }
    },
    input, output);
}

void NEBatchNormalizationLayerKernel::configure_non_fused()
{
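    // Select the kernel instantiation matching the data type and layout.
    // detail::dummy is a no-op activation functor, since no activation is fused here.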
    const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
    switch(_input->info()->data_type())
    {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, false, detail::dummy<float16_t, 8>> :
                    &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, false, detail::dummy<float, 4>> :
                    &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
            break;
        default:
            ARM_COMPUTE_ERROR("Element size not supported");
            break;
    }
}

void NEBatchNormalizationLayerKernel::configure_fused()
{
    // NCHW Fused Batch Normalization with activation functions : FP32
    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
    {
        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
    };
    // NHWC Fused Batch Normalization with activation functions : FP32
    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
    {
        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::relu<float, 4>> },
        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::brelu<float, 4>> },
        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float, true, detail::lubrelu<float, 4>> }
    };
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // NCHW Fused Batch Normalization with activation functions : FP16
    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
    {
        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
    };
    // NHWC Fused Batch Normalization with activation functions : FP16
    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nhwc =
    {
        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::relu<float16_t, 8>> },
        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::brelu<float16_t, 8>> },
        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nhwc<float16_t, true, detail::lubrelu<float16_t, 8>> }
    };
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

    switch(_input->info()->data_type())
    {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f16_nhwc[_act_info.activation()] : bn_fused_map_f16_nchw[_act_info.activation()];
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f32_nhwc[_act_info.activation()] : bn_fused_map_f32_nchw[_act_info.activation()];
            break;
        default:
            ARM_COMPUTE_ERROR("Element size not supported");
            break;
    }
}

NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
{
}

void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
                                                const ITensor *mean, const ITensor *var,
                                                const ITensor *beta, const ITensor *gamma,
                                                float epsilon, ActivationLayerInfo act_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
                                                  mean->info(), var->info(),
                                                  (beta != nullptr) ? beta->info() : nullptr,
                                                  (gamma != nullptr) ? gamma->info() : nullptr,
                                                  epsilon, act_info));

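    // Default to running in place; _output is repointed below when a distinct
    // output tensor is provided.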
    _input    = input;
    _output   = input;
    _mean     = mean;
    _var      = var;
    _gamma    = gamma;
    _beta     = beta;
    _epsilon  = epsilon;
    _act_info = act_info;

    const bool run_in_place = (output == nullptr) || (output == input);
    if(!run_in_place)
    {
        _output = output;
    }

    // Configure activation function to run
    if(_act_info.enabled())
    {
        configure_fused();
    }
    else
    {
        configure_non_fused();
    }

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), (run_in_place) ? nullptr : output->info(), mean->info(), var->info(), (gamma != nullptr) ? gamma->info() : nullptr,
                                                    (beta != nullptr) ? beta->info() : nullptr);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}

Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
                                                 const ITensorInfo *mean, const ITensorInfo *var,
                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
                                                 float epsilon, ActivationLayerInfo act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output ? output->clone().get() : nullptr, mean->clone().get(), var->clone().get(),
                                                              (gamma != nullptr) ? gamma->clone().get() : nullptr, (beta != nullptr) ? beta->clone().get() : nullptr)
                                .first);

    return Status{};
}

void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    (this->*_func)(window);
}
} // namespace arm_compute