• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
25 
26 #include "arm_compute/core/Helpers.h"
27 #include "arm_compute/core/ITensor.h"
28 #include "arm_compute/core/TensorInfo.h"
29 #include "arm_compute/core/Types.h"
30 #include "arm_compute/core/Window.h"
31 #include "src/core/CPP/Validate.h"
32 #include "src/core/NEON/NEMath.h"
33 #include "src/core/NEON/wrapper/wrapper.h"
34 #include "src/core/helpers/AutoConfiguration.h"
35 #include "src/core/helpers/WindowHelpers.h"
36 
37 namespace arm_compute
38 {
39 namespace
40 {
validate_arguments(const ITensorInfo * input,const ITensorInfo * output,float epsilon)41 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
42 {
43     ARM_COMPUTE_UNUSED(epsilon);
44     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
45     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
46     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Input tensor cannot have more than 2 dimensions");
47     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
48 
49     // Checks performed when output is configured
50     if((output != nullptr) && (output->total_size() != 0))
51     {
52         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
53         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
54     }
55     return Status{};
56 }
57 
validate_and_configure_window(ITensorInfo * input,ITensorInfo * output)58 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
59 {
60     if(output != nullptr)
61     {
62         ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
63         // Output auto inizialitation if not yet initialized
64         auto_init_if_empty(*output, *input);
65     }
66 
67     // This kernel doesn't need padding. A left-over for loop on dimension X, we cannot have any read or write out of memory
68     // For this reason num_elems_processed_per_iteration is set to 1
69     Window win = calculate_max_window(*input, Steps());
70     if(output != nullptr)
71     {
72         output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
73     }
74 
75     return std::make_pair(Status{}, win);
76 }
77 } // namespace
78 
79 template <typename ScalarType, int size>
mean_stddev_normalization(const Window & window)80 void NEMeanStdDevNormalizationKernel::mean_stddev_normalization(const Window &window)
81 {
82     using ExactTagType = typename wrapper::traits::neon_vector<ScalarType, size>::tag_type;
83 
84     // Set build options
85     Window win = window;
86     win.set(Window::DimX, Window::Dimension(0, 1, 1));
87 
88     const int  window_step_x  = size;
89     const auto window_start_x = static_cast<int>(window.x().start());
90     const auto window_end_x   = static_cast<int>(window.x().end());
91 
92     Iterator input(_input, win);
93     Iterator output(_output, win);
94 
95     execute_window_loop(win, [&](const Coordinates &)
96     {
97         int  x       = window_start_x;
98         auto in_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
99         auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
100 
101         auto sum_vec    = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
102         auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
103 
104         for(; x <= (window_end_x - window_step_x); x += window_step_x)
105         {
106             auto data  = wrapper::vloadq(in_ptr + x);
107             sum_vec    = wrapper::vadd(sum_vec, data);
108             sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
109         }
110 
111         auto sum_carry_res    = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
112         auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
113         for(int i = 0; i < size / 4; ++i)
114         {
115             sum_carry_res    = wrapper::vpadd(sum_carry_res, sum_carry_res);
116             sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
117         }
118 
119         auto sum    = wrapper::vgetlane(sum_carry_res, 0);
120         auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
121 
122         // Compute left-over elements
123         for(; x < window_end_x; ++x)
124         {
125             ScalarType data = *(in_ptr + x);
126             sum += data;
127             sum_sq += data * data;
128         }
129 
130         ScalarType mean       = sum / _input->info()->dimension(0);
131         ScalarType var        = (sum_sq / _input->info()->dimension(0)) - (mean * mean);
132         ScalarType stddev_inv = 1.f / sqrt(var + _epsilon);
133 
134         auto mean_vec       = wrapper::vdup_n(mean, ExactTagType{});
135         auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
136         for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
137         {
138             auto data = wrapper::vloadq(in_ptr + x);
139             auto res  = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
140             // Store results
141             wrapper::vstore(out_ptr + x, res);
142         }
143         for(; x < window_end_x; ++x)
144         {
145             *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
146         }
147     },
148     input, output);
149 }
150 
NEMeanStdDevNormalizationKernel()151 NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
152     : _input(nullptr), _output(nullptr), _epsilon(1e-8f), _func(nullptr)
153 {
154 }
155 
configure(ITensor * input,ITensor * output,float epsilon)156 void NEMeanStdDevNormalizationKernel::configure(ITensor *input, ITensor *output, float epsilon)
157 {
158     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
159 
160     ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
161 
162     _input   = input;
163     _output  = (output == nullptr) ? input : output;
164     _epsilon = epsilon;
165 
166     // Configure kernel window
167     auto win_config = validate_and_configure_window(input->info(), (output == nullptr) ? nullptr : output->info());
168     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
169     ICPPKernel::configure(win_config.second);
170 
171     // Configure function to run based on different data types
172     const DataType data_type = input->info()->data_type();
173     switch(data_type)
174     {
175         case DataType::F32:
176             _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float, 4>;
177             break;
178 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
179         case DataType::F16:
180             _func = &NEMeanStdDevNormalizationKernel::mean_stddev_normalization<float16_t, 8>;
181             break;
182 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
183         default:
184             ARM_COMPUTE_ERROR("Not Supported");
185             break;
186     }
187 }
188 
validate(const ITensorInfo * input,const ITensorInfo * output,float epsilon)189 Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
190 {
191     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
192     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
193     return Status{};
194 }
195 
run(const Window & window,const ThreadInfo & info)196 void NEMeanStdDevNormalizationKernel::run(const Window &window, const ThreadInfo &info)
197 {
198     ARM_COMPUTE_UNUSED(info);
199     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
200     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
201     ARM_COMPUTE_ERROR_ON(_func == nullptr);
202 
203     (this->*_func)(window);
204 }
205 } // namespace arm_compute
206