/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <algorithm>
#include <arm_neon.h>
#include <climits>
#include <cstddef>

using namespace arm_compute::misc::shape_calculator;

namespace arm_compute
{
namespace
{
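// Ensure the input is an F32 tensor with at least three dimensions and, if the
// output is already initialized, that its shape matches compute_min_max_shape(input).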
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() < 3);

    if(output->tensor_shape().total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);

        TensorShape output_shape = compute_min_max_shape(input);

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
    }

    return Status{};
}

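// Auto-initialize the output if needed and compute the kernel window. The output
// access spans two elements per iteration because each batch stores an
// interleaved {min, max} pair.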
std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
{
    TensorShape output_shape = compute_min_max_shape(input);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, output_shape, 1, input->data_type());

    constexpr unsigned int num_elems_processed_per_iteration = 1;

    // Configure kernel window
    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
    AccessWindowHorizontal output_access(output, 0, 2);

    bool window_changed = update_window_and_padding(win, input_access, output_access);

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_tuple(err, win);
}
} // namespace

NEMinMaxLayerKernel::NEMinMaxLayerKernel()
    : _input(nullptr), _output(nullptr), _mtx()
{
}

void NEMinMaxLayerKernel::configure(const ITensor *input, ITensor *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));

    _input  = input;
    _output = output;

    auto win_config = validate_and_configure_window(input->info(), output->info());

    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));

    INEKernel::configure(std::get<1>(win_config));
}

Status NEMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));

    return Status{};
}

void NEMinMaxLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    const int x_start = window.x().start();
    const int x_end   = window.x().end();

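    // Collapse the X dimension of the output window: each batch position writes
    // a single interleaved {min, max} pair.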
    Window window_output;
    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));

    // Handle X dimension manually to split into two loops
    // First one will use vector operations, second one processes the left over pixels
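    // The batch dimension (3) is collapsed as well; the per-batch offset is
    // applied explicitly in the inner loop via the input tensor's stride.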
    Window window_input(window);
    window_input.set(Window::DimX, Window::Dimension(0, 1, 1));
    window_input.set(3, Window::Dimension(0, 1, 1));

    Iterator input(_input, window_input);
    Iterator output(_output, window_output);

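    // One outer iteration per batch: the carry vectors and scalars accumulate the
    // running minimum and maximum over all elements of the current batch.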
    execute_window_loop(window_output, [&](const Coordinates & id_batch)
    {
        float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
        float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());

        float carry_min_scalar = std::numeric_limits<float>::max();
        float carry_max_scalar = std::numeric_limits<float>::lowest();

        execute_window_loop(window_input, [&](const Coordinates &)
        {
            int        x      = x_start;
            const auto in_ptr = reinterpret_cast<const float *>(input.ptr() + id_batch[1] * _input->info()->strides_in_bytes()[3]);

            // Vector loop
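            // Eight floats per iteration: vld2q_f32 de-interleaves them into two
            // 4-lane registers, whose element-wise min/max is then folded into
            // the 2-lane carry vectors.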
            for(; x <= x_end - 8; x += 8)
            {
                const float32x4x2_t pixels   = vld2q_f32(in_ptr + x);
                const float32x4_t   tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
                const float32x4_t   tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
                const float32x2_t   tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
                const float32x2_t   tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
                carry_min                    = vmin_f32(tmp_min2, carry_min);
                carry_max                    = vmax_f32(tmp_max2, carry_max);
            }

            // Process leftover pixels
            for(; x < x_end; ++x)
            {
                const float pixel = in_ptr[x];
                carry_min_scalar  = std::min(pixel, carry_min_scalar);
                carry_max_scalar  = std::max(pixel, carry_max_scalar);
            }
        },
        input);

        // Reduce result
        carry_min = vpmin_f32(carry_min, carry_min);
        carry_max = vpmax_f32(carry_max, carry_max);
        carry_min = vpmin_f32(carry_min, carry_min);
        carry_max = vpmax_f32(carry_max, carry_max);

        // Extract max/min values
        const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
        const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);

        auto out_ptr = reinterpret_cast<float *>(output.ptr());

        // Perform reduction of local min/max values
        update_min_max(out_ptr, min_i, max_i);
    },
    output);
}

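// Reset each stored pair to {std::numeric_limits<float>::max(),
// std::numeric_limits<float>::lowest()} so that the next run() accumulates
// min/max values from a clean state.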
void NEMinMaxLayerKernel::reset()
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

    float32x2_t reset_values = vdup_n_f32(0.0f);
    reset_values             = vset_lane_f32(std::numeric_limits<float>::max(), reset_values, 0);
    reset_values             = vset_lane_f32(std::numeric_limits<float>::lowest(), reset_values, 1);

    Window window_output;
    window_output.use_tensor_dimensions(_output->info()->tensor_shape());
    window_output.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator output(_output, window_output);

    execute_window_loop(window_output, [&](const Coordinates &)
    {
        vst1_f32(reinterpret_cast<float *>(output.ptr()), reset_values);
    },
    output);
}

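// Merge a thread's local {min, max} into the pair already stored at out_ptr.
// The mutex serializes concurrent updates from worker threads; vzip_f32
// re-interleaves the new minimum and maximum so they are written back as {min, max}.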
void NEMinMaxLayerKernel::update_min_max(float *out_ptr, float min, float max)
{
    arm_compute::lock_guard<Mutex> lock(_mtx);

    const float32x2_t old_min = vld1_dup_f32(out_ptr);
    const float32x2_t old_max = vld1_dup_f32(out_ptr + 1);
    const float32x2_t new_min = vmin_f32(vdup_n_f32(min), old_min);
    const float32x2_t new_max = vmax_f32(vdup_n_f32(max), old_max);

    vst1_f32(out_ptr, vzip_f32(new_min, new_max).val[0]);
}
} // namespace arm_compute