• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "src/core/NEON/kernels/NEChannelCombineKernel.h"
25 
26 #include "arm_compute/core/Error.h"
27 #include "arm_compute/core/Helpers.h"
28 #include "arm_compute/core/IAccessWindow.h"
29 #include "arm_compute/core/IMultiImage.h"
30 #include "arm_compute/core/ITensor.h"
31 #include "arm_compute/core/MultiImageInfo.h"
32 #include "arm_compute/core/TensorInfo.h"
33 #include "arm_compute/core/Types.h"
34 #include "arm_compute/core/Validate.h"
35 #include "arm_compute/core/Window.h"
36 #include "src/core/helpers/AutoConfiguration.h"
37 #include "src/core/helpers/WindowHelpers.h"
38 
39 #include <arm_neon.h>
40 
namespace arm_compute
{
// Forward declaration only: this file names Coordinates solely as the
// (unused) reference parameter of the window-loop lambdas below.
class Coordinates;
} // namespace arm_compute

// Allow the library's types to be spelled unqualified in this translation unit
using namespace arm_compute;
47 
NEChannelCombineKernel()48 NEChannelCombineKernel::NEChannelCombineKernel()
49     : _func(nullptr), _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }, _num_elems_processed_per_iteration(8),
50 _is_parallelizable(true)
51 {
52 }
53 
configure(const ITensor * plane0,const ITensor * plane1,const ITensor * plane2,const ITensor * plane3,ITensor * output)54 void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
55 {
56     ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
57     ARM_COMPUTE_ERROR_ON(plane0 == output);
58     ARM_COMPUTE_ERROR_ON(plane1 == output);
59     ARM_COMPUTE_ERROR_ON(plane2 == output);
60 
61     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
62     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
63     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
64     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
65 
66     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
67     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
68     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
69 
70     const Format output_format = output->info()->format();
71 
72     // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
73     if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
74     {
75         // Validate Y plane of input and output
76         ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
77 
78         // Validate U and V plane of the input
79         ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
80     }
81 
82     _planes[0] = plane0;
83     _planes[1] = plane1;
84     _planes[2] = plane2;
85     _planes[3] = nullptr;
86 
87     // Validate the last input tensor only for RGBA format
88     if(Format::RGBA8888 == output_format)
89     {
90         ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
91         ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
92 
93         ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
94         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
95 
96         _planes[3] = plane3;
97     }
98 
99     _output       = output;
100     _output_multi = nullptr;
101 
102     // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
103     if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
104     {
105         _x_subsampling[1] = 2;
106         _x_subsampling[2] = 2;
107     }
108 
109     _num_elems_processed_per_iteration = 8;
110     _is_parallelizable                 = true;
111 
112     // Select function and number of elements to process given the output format
113     switch(output_format)
114     {
115         case Format::RGB888:
116             _func = &NEChannelCombineKernel::combine_3C;
117             break;
118         case Format::RGBA8888:
119             _func = &NEChannelCombineKernel::combine_4C;
120             break;
121         case Format::UYVY422:
122             _num_elems_processed_per_iteration = 16;
123             _func                              = &NEChannelCombineKernel::combine_YUV_1p<true>;
124             break;
125         case Format::YUYV422:
126             _num_elems_processed_per_iteration = 16;
127             _func                              = &NEChannelCombineKernel::combine_YUV_1p<false>;
128             break;
129         default:
130             ARM_COMPUTE_ERROR("Not supported format.");
131             break;
132     }
133 
134     Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
135 
136     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
137     AccessWindowHorizontal plane0_access(plane0->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[0]);
138     AccessWindowHorizontal plane1_access(plane1->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[1]);
139     AccessWindowHorizontal plane2_access(plane2->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[2]);
140     AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, _num_elems_processed_per_iteration);
141 
142     update_window_and_padding(
143         win,
144         plane0_access,
145         plane1_access,
146         plane2_access,
147         plane3_access,
148         output_access);
149 
150     ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
151                                                        plane1->info()->valid_region(),
152                                                        plane2->info()->valid_region());
153 
154     if(plane3 != nullptr)
155     {
156         valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
157     }
158 
159     output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
160 
161     INEKernel::configure(win);
162 }
163 
configure(const IImage * plane0,const IImage * plane1,const IImage * plane2,IMultiImage * output)164 void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
165 {
166     ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
167     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
168     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
169     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
170 
171     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
172     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
173     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
174     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
175 
176     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
177     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
178     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
179 
180     const Format output_format = output->info()->format();
181 
182     // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
183     // Perform validation only for formats which require sub-sampling.
184     if(Format::YUV444 != output_format)
185     {
186         // Validate Y plane of input and output
187         ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
188 
189         // Validate U and V plane of the input
190         ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
191 
192         // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
193         // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
194         ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
195 
196         // Validate the last plane V of format IYUV
197         if(Format::IYUV == output_format)
198         {
199             // Validate Y plane of the output
200             ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
201         }
202     }
203 
204     _planes[0]    = plane0;
205     _planes[1]    = plane1;
206     _planes[2]    = plane2;
207     _planes[3]    = nullptr;
208     _output       = nullptr;
209     _output_multi = output;
210 
211     bool         has_two_planes           = false;
212     unsigned int num_elems_written_plane1 = 8;
213 
214     _num_elems_processed_per_iteration = 8;
215     _is_parallelizable                 = true;
216 
217     switch(output_format)
218     {
219         case Format::NV12:
220         case Format::NV21:
221             _x_subsampling           = { { 1, 2, 2 } };
222             _y_subsampling           = { { 1, 2, 2 } };
223             _func                    = &NEChannelCombineKernel::combine_YUV_2p;
224             has_two_planes           = true;
225             num_elems_written_plane1 = 16;
226             break;
227         case Format::IYUV:
228             _is_parallelizable = false;
229             _x_subsampling     = { { 1, 2, 2 } };
230             _y_subsampling     = { { 1, 2, 2 } };
231             _func              = &NEChannelCombineKernel::combine_YUV_3p;
232             break;
233         case Format::YUV444:
234             _is_parallelizable = false;
235             _x_subsampling     = { { 1, 1, 1 } };
236             _y_subsampling     = { { 1, 1, 1 } };
237             _func              = &NEChannelCombineKernel::combine_YUV_3p;
238             break;
239         default:
240             ARM_COMPUTE_ERROR("Not supported format.");
241             break;
242     }
243 
244     const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
245 
246     Window                win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
247     AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
248     AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
249     AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
250 
251     update_window_and_padding(win,
252                               AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
253                               AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
254                               AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
255                               output_plane0_access,
256                               output_plane1_access,
257                               output_plane2_access);
258 
259     ValidRegion plane0_valid_region  = plane0->info()->valid_region();
260     ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
261 
262     output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
263     output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
264     output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
265 
266     INEKernel::configure(win);
267 }
268 
is_parallelisable() const269 bool NEChannelCombineKernel::is_parallelisable() const
270 {
271     return _is_parallelizable;
272 }
273 
run(const Window & window,const ThreadInfo & info)274 void NEChannelCombineKernel::run(const Window &window, const ThreadInfo &info)
275 {
276     ARM_COMPUTE_UNUSED(info);
277     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
278     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
279     ARM_COMPUTE_ERROR_ON(_func == nullptr);
280 
281     (this->*_func)(window);
282 }
283 
combine_3C(const Window & win)284 void NEChannelCombineKernel::combine_3C(const Window &win)
285 {
286     Iterator p0(_planes[0], win);
287     Iterator p1(_planes[1], win);
288     Iterator p2(_planes[2], win);
289     Iterator out(_output, win);
290 
291     execute_window_loop(win, [&](const Coordinates &)
292     {
293         const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
294         const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
295         const auto p2_ptr  = static_cast<uint8_t *>(p2.ptr());
296         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
297 
298         const uint8x8x3_t pixels =
299         {
300             {
301                 vld1_u8(p0_ptr),
302                 vld1_u8(p1_ptr),
303                 vld1_u8(p2_ptr)
304             }
305         };
306 
307         vst3_u8(out_ptr, pixels);
308     },
309     p0, p1, p2, out);
310 }
311 
combine_4C(const Window & win)312 void NEChannelCombineKernel::combine_4C(const Window &win)
313 {
314     Iterator p0(_planes[0], win);
315     Iterator p1(_planes[1], win);
316     Iterator p2(_planes[2], win);
317     Iterator p3(_planes[3], win);
318     Iterator out(_output, win);
319 
320     execute_window_loop(win, [&](const Coordinates &)
321     {
322         const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
323         const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
324         const auto p2_ptr  = static_cast<uint8_t *>(p2.ptr());
325         const auto p3_ptr  = static_cast<uint8_t *>(p3.ptr());
326         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
327 
328         const uint8x8x4_t pixels =
329         {
330             {
331                 vld1_u8(p0_ptr),
332                 vld1_u8(p1_ptr),
333                 vld1_u8(p2_ptr),
334                 vld1_u8(p3_ptr)
335             }
336         };
337 
338         vst4_u8(out_ptr, pixels);
339     },
340     p0, p1, p2, p3, out);
341 }
342 
343 template <bool is_uyvy>
combine_YUV_1p(const Window & win)344 void NEChannelCombineKernel::combine_YUV_1p(const Window &win)
345 {
346     // Create sub-sampled uv window and init uv planes
347     Window win_uv(win);
348     win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
349     win_uv.validate();
350 
351     Iterator p0(_planes[0], win);
352     Iterator p1(_planes[1], win_uv);
353     Iterator p2(_planes[2], win_uv);
354     Iterator out(_output, win);
355 
356     constexpr auto shift = is_uyvy ? 1 : 0;
357 
358     execute_window_loop(win, [&](const Coordinates &)
359     {
360         const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
361         const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
362         const auto p2_ptr  = static_cast<uint8_t *>(p2.ptr());
363         const auto out_ptr = static_cast<uint8_t *>(out.ptr());
364 
365         const uint8x8x2_t pixels_y = vld2_u8(p0_ptr);
366         const uint8x8x2_t pixels_uv =
367         {
368             {
369                 vld1_u8(p1_ptr),
370                 vld1_u8(p2_ptr)
371             }
372         };
373 
374         uint8x8x4_t pixels{ {} };
375         pixels.val[0 + shift] = pixels_y.val[0];
376         pixels.val[1 - shift] = pixels_uv.val[0];
377         pixels.val[2 + shift] = pixels_y.val[1];
378         pixels.val[3 - shift] = pixels_uv.val[1];
379 
380         vst4_u8(out_ptr, pixels);
381     },
382     p0, p1, p2, out);
383 }
384 
combine_YUV_2p(const Window & win)385 void NEChannelCombineKernel::combine_YUV_2p(const Window &win)
386 {
387     ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[1]);
388     ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[1]);
389 
390     // Copy first plane
391     copy_plane(win, 0);
392 
393     // Update UV window
394     Window uv_win(win);
395     uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
396     uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
397     uv_win.validate();
398 
399     // Update output win
400     Window out_win(win);
401     out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
402     out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
403     out_win.validate();
404 
405     // Construct second plane
406     const int shift = (Format::NV12 == _output_multi->info()->format()) ? 0 : 1;
407     Iterator  p1(_planes[1 + shift], uv_win);
408     Iterator  p2(_planes[2 - shift], uv_win);
409     Iterator  out(_output_multi->plane(1), out_win);
410 
411     // Increase step size after iterator is created to calculate stride correctly for multi channel format
412     out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
413 
414     execute_window_loop(out_win, [&](const Coordinates &)
415     {
416         const uint8x8x2_t pixels =
417         {
418             {
419                 vld1_u8(p1.ptr()),
420                 vld1_u8(p2.ptr())
421             }
422         };
423 
424         vst2_u8(out.ptr(), pixels);
425     },
426     p1, p2, out);
427 }
428 
combine_YUV_3p(const Window & win)429 void NEChannelCombineKernel::combine_YUV_3p(const Window &win)
430 {
431     copy_plane(win, 0);
432     copy_plane(win, 1);
433     copy_plane(win, 2);
434 }
435 
copy_plane(const Window & win,uint32_t plane_id)436 void NEChannelCombineKernel::copy_plane(const Window &win, uint32_t plane_id)
437 {
438     ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[plane_id]);
439     ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[plane_id]);
440 
441     // Update window
442     Window tmp_win(win);
443     tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
444     tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
445 
446     Iterator in(_planes[plane_id], tmp_win);
447     Iterator out(_output_multi->plane(plane_id), tmp_win);
448 
449     execute_window_loop(tmp_win, [&](const Coordinates &)
450     {
451         const uint8x8_t pixels = vld1_u8(in.ptr());
452 
453         vst1_u8(out.ptr(), pixels);
454     },
455     in, out);
456 }
457