• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEMinMaxLocationKernel.h"
25 
26 #include "arm_compute/core/Coordinates.h"
27 #include "arm_compute/core/Error.h"
28 #include "arm_compute/core/Helpers.h"
29 #include "arm_compute/core/IAccessWindow.h"
30 #include "arm_compute/core/ITensor.h"
31 #include "arm_compute/core/TensorInfo.h"
32 #include "arm_compute/core/Types.h"
33 #include "arm_compute/core/Validate.h"
34 #include "arm_compute/core/Window.h"
35 #include "arm_compute/core/utils/misc/Utility.h"
36 #include "src/core/helpers/AutoConfiguration.h"
37 #include "src/core/helpers/WindowHelpers.h"
38 
39 #include <algorithm>
40 #include <arm_neon.h>
41 #include <climits>
42 #include <cstddef>
43 
44 namespace arm_compute
45 {
NEMinMaxKernel()46 NEMinMaxKernel::NEMinMaxKernel()
47     : _func(), _input(nullptr), _min(), _max(), _mtx()
48 {
49 }
50 
configure(const IImage * input,void * min,void * max)51 void NEMinMaxKernel::configure(const IImage *input, void *min, void *max)
52 {
53     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
54     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
55     ARM_COMPUTE_ERROR_ON(nullptr == min);
56     ARM_COMPUTE_ERROR_ON(nullptr == max);
57 
58     _input = input;
59     _min   = min;
60     _max   = max;
61 
62     switch(_input->info()->data_type())
63     {
64         case DataType::U8:
65             _func = &NEMinMaxKernel::minmax_U8;
66             break;
67         case DataType::S16:
68             _func = &NEMinMaxKernel::minmax_S16;
69             break;
70         case DataType::F32:
71             _func = &NEMinMaxKernel::minmax_F32;
72             break;
73         default:
74             ARM_COMPUTE_ERROR("Unsupported data type");
75             break;
76     }
77 
78     // Configure kernel window
79     constexpr unsigned int num_elems_processed_per_iteration = 1;
80 
81     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
82 
83     INEKernel::configure(win);
84 }
85 
run(const Window & window,const ThreadInfo & info)86 void NEMinMaxKernel::run(const Window &window, const ThreadInfo &info)
87 {
88     ARM_COMPUTE_UNUSED(info);
89     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
90     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
91     ARM_COMPUTE_ERROR_ON(_func == nullptr);
92 
93     (this->*_func)(window);
94 }
95 
reset()96 void NEMinMaxKernel::reset()
97 {
98     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
99     switch(_input->info()->data_type())
100     {
101         case DataType::U8:
102             *static_cast<int32_t *>(_min) = UCHAR_MAX;
103             *static_cast<int32_t *>(_max) = 0;
104             break;
105         case DataType::S16:
106             *static_cast<int32_t *>(_min) = SHRT_MAX;
107             *static_cast<int32_t *>(_max) = SHRT_MIN;
108             break;
109         case DataType::F32:
110             *static_cast<float *>(_min) = std::numeric_limits<float>::max();
111             *static_cast<float *>(_max) = std::numeric_limits<float>::lowest();
112             break;
113         default:
114             ARM_COMPUTE_ERROR("Unsupported data type");
115             break;
116     }
117 }
118 
119 template <typename T>
update_min_max(const T min,const T max)120 void NEMinMaxKernel::update_min_max(const T min, const T max)
121 {
122     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
123 
124     using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
125 
126     auto min_ptr = static_cast<type *>(_min);
127     auto max_ptr = static_cast<type *>(_max);
128 
129     if(min < *min_ptr)
130     {
131         *min_ptr = min;
132     }
133 
134     if(max > *max_ptr)
135     {
136         *max_ptr = max;
137     }
138 }
139 
minmax_U8(Window win)140 void NEMinMaxKernel::minmax_U8(Window win)
141 {
142     uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
143     uint8x8_t carry_max = vdup_n_u8(0);
144 
145     uint8_t carry_max_scalar = 0;
146     uint8_t carry_min_scalar = UCHAR_MAX;
147 
148     const int x_start = win.x().start();
149     const int x_end   = win.x().end();
150 
151     // Handle X dimension manually to split into two loops
152     // First one will use vector operations, second one processes the left over pixels
153     win.set(Window::DimX, Window::Dimension(0, 1, 1));
154 
155     Iterator input(_input, win);
156 
157     execute_window_loop(win, [&](const Coordinates &)
158     {
159         int x = x_start;
160 
161         // Vector loop
162         for(; x <= x_end - 16; x += 16)
163         {
164             const uint8x16_t pixels  = vld1q_u8(input.ptr() + x);
165             const uint8x8_t  tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
166             const uint8x8_t  tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
167             carry_min                = vmin_u8(tmp_min, carry_min);
168             carry_max                = vmax_u8(tmp_max, carry_max);
169         }
170 
171         // Process leftover pixels
172         for(; x < x_end; ++x)
173         {
174             const uint8_t pixel = input.ptr()[x];
175             carry_min_scalar    = std::min(pixel, carry_min_scalar);
176             carry_max_scalar    = std::max(pixel, carry_max_scalar);
177         }
178     },
179     input);
180 
181     // Reduce result
182     carry_min = vpmin_u8(carry_min, carry_min);
183     carry_max = vpmax_u8(carry_max, carry_max);
184     carry_min = vpmin_u8(carry_min, carry_min);
185     carry_max = vpmax_u8(carry_max, carry_max);
186     carry_min = vpmin_u8(carry_min, carry_min);
187     carry_max = vpmax_u8(carry_max, carry_max);
188 
189     // Extract max/min values
190     const uint8_t min_i = std::min(vget_lane_u8(carry_min, 0), carry_min_scalar);
191     const uint8_t max_i = std::max(vget_lane_u8(carry_max, 0), carry_max_scalar);
192 
193     // Perform reduction of local min/max values
194     update_min_max(min_i, max_i);
195 }
196 
minmax_S16(Window win)197 void NEMinMaxKernel::minmax_S16(Window win)
198 {
199     int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
200     int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
201 
202     int16_t carry_max_scalar = SHRT_MIN;
203     int16_t carry_min_scalar = SHRT_MAX;
204 
205     const int x_start = win.x().start();
206     const int x_end   = win.x().end();
207 
208     // Handle X dimension manually to split into two loops
209     // First one will use vector operations, second one processes the left over pixels
210     win.set(Window::DimX, Window::Dimension(0, 1, 1));
211 
212     Iterator input(_input, win);
213 
214     execute_window_loop(win, [&](const Coordinates &)
215     {
216         int        x      = x_start;
217         const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
218 
219         // Vector loop
220         for(; x <= x_end - 16; x += 16)
221         {
222             const int16x8x2_t pixels   = vld2q_s16(in_ptr + x);
223             const int16x8_t   tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
224             const int16x8_t   tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
225             const int16x4_t   tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
226             const int16x4_t   tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
227             carry_min                  = vmin_s16(tmp_min2, carry_min);
228             carry_max                  = vmax_s16(tmp_max2, carry_max);
229         }
230 
231         // Process leftover pixels
232         for(; x < x_end; ++x)
233         {
234             const int16_t pixel = in_ptr[x];
235             carry_min_scalar    = std::min(pixel, carry_min_scalar);
236             carry_max_scalar    = std::max(pixel, carry_max_scalar);
237         }
238 
239     },
240     input);
241 
242     // Reduce result
243     carry_min = vpmin_s16(carry_min, carry_min);
244     carry_max = vpmax_s16(carry_max, carry_max);
245     carry_min = vpmin_s16(carry_min, carry_min);
246     carry_max = vpmax_s16(carry_max, carry_max);
247 
248     // Extract max/min values
249     const int16_t min_i = std::min(vget_lane_s16(carry_min, 0), carry_min_scalar);
250     const int16_t max_i = std::max(vget_lane_s16(carry_max, 0), carry_max_scalar);
251 
252     // Perform reduction of local min/max values
253     update_min_max(min_i, max_i);
254 }
255 
minmax_F32(Window win)256 void NEMinMaxKernel::minmax_F32(Window win)
257 {
258     float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
259     float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
260 
261     float carry_min_scalar = std::numeric_limits<float>::max();
262     float carry_max_scalar = std::numeric_limits<float>::lowest();
263 
264     const int x_start = win.x().start();
265     const int x_end   = win.x().end();
266 
267     // Handle X dimension manually to split into two loops
268     // First one will use vector operations, second one processes the left over pixels
269     win.set(Window::DimX, Window::Dimension(0, 1, 1));
270 
271     Iterator input(_input, win);
272 
273     execute_window_loop(win, [&](const Coordinates &)
274     {
275         int        x      = x_start;
276         const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
277 
278         // Vector loop
279         for(; x <= x_end - 8; x += 8)
280         {
281             const float32x4x2_t pixels   = vld2q_f32(in_ptr + x);
282             const float32x4_t   tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
283             const float32x4_t   tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
284             const float32x2_t   tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
285             const float32x2_t   tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
286             carry_min                    = vmin_f32(tmp_min2, carry_min);
287             carry_max                    = vmax_f32(tmp_max2, carry_max);
288         }
289 
290         // Process leftover pixels
291         for(; x < x_end; ++x)
292         {
293             const float pixel = in_ptr[x];
294             carry_min_scalar  = std::min(pixel, carry_min_scalar);
295             carry_max_scalar  = std::max(pixel, carry_max_scalar);
296         }
297 
298     },
299     input);
300 
301     // Reduce result
302     carry_min = vpmin_f32(carry_min, carry_min);
303     carry_max = vpmax_f32(carry_max, carry_max);
304     carry_min = vpmin_f32(carry_min, carry_min);
305     carry_max = vpmax_f32(carry_max, carry_max);
306 
307     // Extract max/min values
308     const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
309     const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
310 
311     // Perform reduction of local min/max values
312     update_min_max(min_i, max_i);
313 }
314 
NEMinMaxLocationKernel()315 NEMinMaxLocationKernel::NEMinMaxLocationKernel()
316     : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr)
317 {
318 }
319 
is_parallelisable() const320 bool NEMinMaxLocationKernel::is_parallelisable() const
321 {
322     return false;
323 }
324 
325 template <class T, std::size_t... N>
326 struct NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>
327 {
328     static const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> func_table;
329 };
330 
331 template <class T, std::size_t... N>
332 const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table
333 {
334     &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
335 };
336 
configure(const IImage * input,void * min,void * max,ICoordinates2DArray * min_loc,ICoordinates2DArray * max_loc,uint32_t * min_count,uint32_t * max_count)337 void NEMinMaxLocationKernel::configure(const IImage *input, void *min, void *max,
338                                        ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
339                                        uint32_t *min_count, uint32_t *max_count)
340 {
341     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
342     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
343     ARM_COMPUTE_ERROR_ON(nullptr == min);
344     ARM_COMPUTE_ERROR_ON(nullptr == max);
345 
346     _input     = input;
347     _min       = min;
348     _max       = max;
349     _min_count = min_count;
350     _max_count = max_count;
351     _min_loc   = min_loc;
352     _max_loc   = max_loc;
353 
354     unsigned int count_min = (nullptr != min_count ? 1 : 0);
355     unsigned int count_max = (nullptr != max_count ? 1 : 0);
356     unsigned int loc_min   = (nullptr != min_loc ? 1 : 0);
357     unsigned int loc_max   = (nullptr != max_loc ? 1 : 0);
358 
359     unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
360 
361     switch(input->info()->data_type())
362     {
363         case DataType::U8:
364             _func = create_func_table<uint8_t, utility::index_sequence_t<16>>::func_table[table_idx];
365             break;
366         case DataType::S16:
367             _func = create_func_table<int16_t, utility::index_sequence_t<16>>::func_table[table_idx];
368             break;
369         case DataType::F32:
370             _func = create_func_table<float, utility::index_sequence_t<16>>::func_table[table_idx];
371             break;
372         default:
373             ARM_COMPUTE_ERROR("Unsupported data type");
374             break;
375     }
376 
377     constexpr unsigned int num_elems_processed_per_iteration = 1;
378 
379     // Configure kernel window
380     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
381 
382     update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
383 
384     INEKernel::configure(win);
385 }
386 
run(const Window & window,const ThreadInfo & info)387 void NEMinMaxLocationKernel::run(const Window &window, const ThreadInfo &info)
388 {
389     ARM_COMPUTE_UNUSED(info);
390     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
391     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
392     ARM_COMPUTE_ERROR_ON(_func == nullptr);
393 
394     (this->*_func)(window);
395 }
396 
397 template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
minmax_loc(const Window & win)398 void NEMinMaxLocationKernel::minmax_loc(const Window &win)
399 {
400     if(count_min || count_max || loc_min || loc_max)
401     {
402         Iterator input(_input, win);
403 
404         size_t min_count = 0;
405         size_t max_count = 0;
406 
407         // Clear min location array
408         if(loc_min)
409         {
410             _min_loc->clear();
411         }
412 
413         // Clear max location array
414         if(loc_max)
415         {
416             _max_loc->clear();
417         }
418 
419         using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
420 
421         auto min_ptr = static_cast<type *>(_min);
422         auto max_ptr = static_cast<type *>(_max);
423 
424         execute_window_loop(win, [&](const Coordinates & id)
425         {
426             auto    in_ptr = reinterpret_cast<const T *>(input.ptr());
427             int32_t idx    = id.x();
428             int32_t idy    = id.y();
429 
430             const T       pixel = *in_ptr;
431             Coordinates2D p{ idx, idy };
432 
433             if(count_min || loc_min)
434             {
435                 if(*min_ptr == pixel)
436                 {
437                     if(count_min)
438                     {
439                         ++min_count;
440                     }
441 
442                     if(loc_min)
443                     {
444                         _min_loc->push_back(p);
445                     }
446                 }
447             }
448 
449             if(count_max || loc_max)
450             {
451                 if(*max_ptr == pixel)
452                 {
453                     if(count_max)
454                     {
455                         ++max_count;
456                     }
457 
458                     if(loc_max)
459                     {
460                         _max_loc->push_back(p);
461                     }
462                 }
463             }
464         },
465         input);
466 
467         if(count_min)
468         {
469             *_min_count = min_count;
470         }
471 
472         if(count_max)
473         {
474             *_max_count = max_count;
475         }
476     }
477 }
478 } // namespace arm_compute
479