/*
 * By downloading, copying, installing or using the software you agree to this license.
 * If you do not agree to this license, do not download, install,
 * copy or use the software.
 *
 *
 *                           License Agreement
 *                For Open Source Computer Vision Library
 *                        (3-clause BSD License)
 *
 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the names of the copyright holders nor the names of the contributors
 *     may be used to endorse or promote products derived from this software
 *     without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall copyright holders or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include "common.hpp"

#include "saturate_cast.hpp"

namespace CAROTENE_NS {

#ifdef CAROTENE_NEON

namespace {

enum
{
    SHIFT = 14,
    SHIFT_DELTA = 1 << (SHIFT - 1),

    R2Y_BT601   = 4899,
    G2Y_BT601   = 9617,
    B2Y_BT601   = 1868,

    R2Y_BT709   = 3483,
    G2Y_BT709   = 11718,
    B2Y_BT709   = 1183,
};

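// The R2Y/G2Y/B2Y constants above are the BT.601 / BT.709 luma weights in Q14
// fixed point (e.g. 0.299 * 2^14 ~= 4899), so the conversion is
// gray = (R*R2Y + G*G2Y + B*B2Y + SHIFT_DELTA) >> SHIFT.
// convertToGray below evaluates that sum for 8 pixels at a time: widening
// multiply-accumulate into 32 bits, a rounding shift right by SHIFT, then a
// saturating narrow back to u8.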
inline uint8x8_t convertToGray(const uint16x8_t & v_r,
                               const uint16x8_t & v_g,
                               const uint16x8_t & v_b,
                               const uint16x4_t & v_r2y,
                               const uint16x4_t & v_g2y,
                               const uint16x4_t & v_b2y)
{
    uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y);
    uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y);

    v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y);
    v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y);

    v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y);
    v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y);

    uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT),
                                               vrshrn_n_u32(v_dst1, SHIFT)));

    return v_gray;
}

} // namespace

#endif

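// rgb2gray and the rgbx2/bgr2/bgrx2 variants below share one structure: an
// inline-assembly inner loop used only on 32-bit ARM with GCC 4.x older than
// 4.7 (and not clang), an intrinsics path that processes 16 and then 8 pixels
// per iteration everywhere else, and a scalar tail for the last pixels of each
// row. Only the channel order and the per-pixel source stride (3 or 4) differ.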
void rgb2gray(const Size2D &size, COLOR_SPACE color_space,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
    const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
    const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
    register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
    register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
    register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
#else
    uint16x4_t v_r2y = vdup_n_u16(R2Y),
               v_g2y = vdup_n_u16(G2Y),
               v_b2y = vdup_n_u16(B2Y);

    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
        for (; dj < roiw8; sj += 24, dj += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
            "vld3.8 {d0-d2}, [%[in]] @RGB                       \n\t"
            "vmovl.u8 q2, d0         @R (d4,d5)                 \n\t"
            "vmovl.u8 q3, d1         @G (d6,d7)                 \n\t"
            "vmovl.u8 q4, d2         @B (d8,d9)                 \n\t"
            "vmull.u16 q5, d6, d30   @Y (q5,q6):  G             \n\t"
            "vmull.u16 q6, d7, d30   @Y (q5,q6):  G             \n\t"
            "vmlal.s16 q5, d8, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q6, d9, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q5, d4, d31   @Y (q5,q6):  GBR           \n\t"
            "vmlal.s16 q6, d5, d31   @Y (q5,q6):  GBR           \n\t"
            "vrshrn.s32 d8, q5, #14  @Y  -> q4                  \n\t"
            "vrshrn.s32 d9, q6, #14  @Y  -> q4                  \n\t"
            "vqmovn.u16 d4, q4                                  \n\t"
            "vst1.8 {d4}, [%[out]]                              \n\t"
            : /*no output*/
            : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
            );
        }
#else
        for (; dj < roiw16; sj += 48, dj += 16)
        {
            internal::prefetch(src + sj);
            uint8x16x3_t v_src0 = vld3q_u8(src + sj);
            // 0
            uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),
                       v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
                       v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));
            uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),
            v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
            v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));
            uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
        }

        if (dj < roiw8)
        {
            uint8x8x3_t v_src = vld3_u8(src + sj);
            uint16x8_t v_r = vmovl_u8(v_src.val[0]),
                       v_g = vmovl_u8(v_src.val[1]),
                       v_b = vmovl_u8(v_src.val[2]);
            uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1_u8(dst + dj, v_gray);
            sj += 24; dj += 8;
        }
#endif

        for (; dj < size.width; sj += 3, dj++)
        {
            u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;
            dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
        }
    }
#else
    (void)size;
    (void)color_space;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void rgbx2gray(const Size2D &size, COLOR_SPACE color_space,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
    const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
    const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
    register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
    register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
    register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
#else
    uint16x4_t v_r2y = vdup_n_u16(R2Y),
               v_g2y = vdup_n_u16(G2Y),
               v_b2y = vdup_n_u16(B2Y);

    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
        for (; dj < roiw8; sj += 32, dj += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
            "vld4.8 {d0-d3}, [%[in]] @RGBA                      \n\t"
            "vmovl.u8 q2, d0         @R (d4,d5)                 \n\t"
            "vmovl.u8 q3, d1         @G (d6,d7)                 \n\t"
            "vmovl.u8 q4, d2         @B (d8,d9)                 \n\t"
            "vmull.u16 q5, d6, d30   @Y (q5,q6):  G             \n\t"
            "vmull.u16 q6, d7, d30   @Y (q5,q6):  G             \n\t"
            "vmlal.s16 q5, d8, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q6, d9, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q5, d4, d31   @Y (q5,q6):  GBR           \n\t"
            "vmlal.s16 q6, d5, d31   @Y (q5,q6):  GBR           \n\t"
            "vrshrn.s32 d8, q5, #14  @Y  -> q4                  \n\t"
            "vrshrn.s32 d9, q6, #14  @Y  -> q4                  \n\t"
            "vqmovn.u16 d4, q4                                  \n\t"
            "vst1.8 {d4}, [%[out]]                              \n\t"
            : /*no output*/
            : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
            );
        }
#else
        for (; dj < roiw16; sj += 64, dj += 16)
        {
            internal::prefetch(src + sj);
            uint8x16x4_t v_src0 = vld4q_u8(src + sj);

            // 0
            uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),
                       v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
                       v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));
            uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),
            v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
            v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));
            uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
        }

        if (dj < roiw8)
        {
            uint8x8x4_t v_src = vld4_u8(src + sj);
            uint16x8_t v_r = vmovl_u8(v_src.val[0]),
                       v_g = vmovl_u8(v_src.val[1]),
                       v_b = vmovl_u8(v_src.val[2]);
            uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1_u8(dst + dj, v_gray);
            sj += 32; dj += 8;
        }
#endif

        for (; dj < size.width; sj += 4, dj++)
        {
            u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;
            dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
        }
    }
#else
    (void)size;
    (void)color_space;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void bgr2gray(const Size2D &size, COLOR_SPACE color_space,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
    const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
    const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
    register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
    register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
    register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
#else
    uint16x4_t v_r2y = vdup_n_u16(R2Y),
               v_g2y = vdup_n_u16(G2Y),
               v_b2y = vdup_n_u16(B2Y);

    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
        for (; dj < roiw8; sj += 24, dj += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
            "vld3.8 {d0-d2}, [%[in]] @BGR                       \n\t"
            "vmovl.u8 q2, d2         @R (d4,d5)                 \n\t"
            "vmovl.u8 q3, d1         @G (d6,d7)                 \n\t"
            "vmovl.u8 q4, d0         @B (d8,d9)                 \n\t"
            "vmull.u16 q5, d6, d30   @Y (q5,q6):  G             \n\t"
            "vmull.u16 q6, d7, d30   @Y (q5,q6):  G             \n\t"
            "vmlal.s16 q5, d8, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q6, d9, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q5, d4, d31   @Y (q5,q6):  GBR           \n\t"
            "vmlal.s16 q6, d5, d31   @Y (q5,q6):  GBR           \n\t"
            "vrshrn.s32 d8, q5, #14  @Y  -> q4                  \n\t"
            "vrshrn.s32 d9, q6, #14  @Y  -> q4                  \n\t"
            "vqmovn.u16 d4, q4                                  \n\t"
            "vst1.8 {d4}, [%[out]]                              \n\t"
            : /*no output*/
            : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
            );
        }
#else
        for (; dj < roiw16; sj += 48, dj += 16)
        {
            internal::prefetch(src + sj);
            uint8x16x3_t v_src0 = vld3q_u8(src + sj);

            // 0
            uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),
                       v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
                       v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));
            uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),
            v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
            v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));
            uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
        }

        if (dj < roiw8)
        {
            uint8x8x3_t v_src = vld3_u8(src + sj);
            uint16x8_t v_b = vmovl_u8(v_src.val[0]),
                       v_g = vmovl_u8(v_src.val[1]),
                       v_r = vmovl_u8(v_src.val[2]);
            uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1_u8(dst + dj, v_gray);
            sj += 24; dj += 8;
        }
#endif

        for (; dj < size.width; sj += 3, dj++)
        {
            u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;
            dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
        }
    }
#else
    (void)size;
    (void)color_space;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void bgrx2gray(const Size2D &size, COLOR_SPACE color_space,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
    const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
    const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
    register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
    register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
    register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
#else
    uint16x4_t v_r2y = vdup_n_u16(R2Y),
               v_g2y = vdup_n_u16(G2Y),
               v_b2y = vdup_n_u16(B2Y);

    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
        for (; dj < roiw8; sj += 32, dj += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
            "vld4.8 {d0-d3}, [%[in]] @BGRA                      \n\t"
            "vmovl.u8 q2, d2         @R (d4,d5)                 \n\t"
            "vmovl.u8 q3, d1         @G (d6,d7)                 \n\t"
            "vmovl.u8 q4, d0         @B (d8,d9)                 \n\t"
            "vmull.u16 q5, d6, d30   @Y (q5,q6):  G             \n\t"
            "vmull.u16 q6, d7, d30   @Y (q5,q6):  G             \n\t"
            "vmlal.s16 q5, d8, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q6, d9, d29   @Y (q5,q6):  GB            \n\t"
            "vmlal.s16 q5, d4, d31   @Y (q5,q6):  GBR           \n\t"
            "vmlal.s16 q6, d5, d31   @Y (q5,q6):  GBR           \n\t"
            "vrshrn.s32 d8, q5, #14  @Y  -> q4                  \n\t"
            "vrshrn.s32 d9, q6, #14  @Y  -> q4                  \n\t"
            "vqmovn.u16 d4, q4                                  \n\t"
            "vst1.8 {d4}, [%[out]]                              \n\t"
            : /*no output*/
            : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
            : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
            );
        }
#else
        for (; dj < roiw16; sj += 64, dj += 16)
        {
            internal::prefetch(src + sj);
            uint8x16x4_t v_src0 = vld4q_u8(src + sj);

            // 0
            uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),
                       v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
                       v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));
            uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),
            v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
            v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));
            uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
        }

        if (dj < roiw8)
        {
            uint8x8x4_t v_src = vld4_u8(src + sj);
            uint16x8_t v_b = vmovl_u8(v_src.val[0]),
                       v_g = vmovl_u8(v_src.val[1]),
                       v_r = vmovl_u8(v_src.val[2]);
            uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);

            vst1_u8(dst + dj, v_gray);
            sj += 32; dj += 8;
        }
#endif

        for (; dj < size.width; sj += 4, dj++)
        {
            u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;
            dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
        }
    }
#else
    (void)size;
    (void)color_space;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

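// gray2rgb / gray2rgbx replicate the single source channel into three (or
// four) destination channels; the four-channel variant additionally fills the
// alpha channel with 255.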
void gray2rgb(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;

        for (; sj < roiw16; sj += 16, dj += 48)
        {
            internal::prefetch(src + sj);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
            __asm__ (
            "vld1.8 {d0-d1}, [%[in0]]            \n\t"
            "vmov.8 q1, q0                       \n\t"
            "vmov.8 q2, q0                       \n\t"
            "vmov.8 q3, q1                       \n\t"
            "vst3.8 {d2, d4, d6}, [%[out0]]      \n\t"
            "vst3.8 {d3, d5, d7}, [%[out1]]      \n\t"
            : /*no output*/
            : [out0] "r" (dst + dj),      [out1] "r" (dst + dj + 24),
              [in0] "r" (src + sj)
            : "d0","d1","d2","d3","d4","d5","d6","d7"
            );
#else
            uint8x16x3_t vRgb1;
            vRgb1.val[0] = vld1q_u8(src + sj);

            vRgb1.val[1] = vRgb1.val[0];
            vRgb1.val[2] = vRgb1.val[0];

            vst3q_u8(dst + dj, vRgb1);
#endif
        }

        if (sj < roiw8)
        {
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
            __asm__ (
            "vld1.8 {d0}, [%[in]]                \n\t"
            "vmov.8 d1, d0                       \n\t"
            "vmov.8 d2, d0                       \n\t"
            "vst3.8 {d0-d2}, [%[out]]            \n\t"
            : /*no output*/
            : [out] "r" (dst + dj), [in] "r" (src + sj)
            : "d0","d1","d2"
            );
#else
            uint8x8x3_t vRgb2;
            vRgb2.val[0] = vld1_u8(src + sj);
            vRgb2.val[1] = vRgb2.val[0];
            vRgb2.val[2] = vRgb2.val[0];

            vst3_u8(dst + dj, vRgb2);
#endif
            sj += 8; dj += 24;
        }

        for (; sj < size.width; sj++, dj += 3)
        {
            dst[dj+0] = src[sj];
            dst[dj+1] = src[sj];
            dst[dj+2] = src[sj];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void gray2rgbx(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
    register uint8x16_t vc255   asm ("q4") = vmovq_n_u8(255);
#else
    uint8x16x4_t vRgba;
    uint8x8x4_t  vRgba2;
    vRgba.val[3]  = vmovq_n_u8(255);
    vRgba2.val[3] = vget_low_u8(vRgba.val[3]);
#endif

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u;

        for (; sj < roiw16; sj += 16, dj += 64)
        {
            internal::prefetch(src + sj);
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
            __asm__ (
            "vld1.8 {d0-d1}, [%[in0]]            \n\t"
            "vmov.8 q1, q0                       \n\t"
            "vmov.8 q2, q0                       \n\t"
            "vmov.8 q3, q1                       \n\t"
            "vst4.8 {d2, d4, d6, d8}, [%[out0]]  \n\t"
            "vst4.8 {d3, d5, d7, d9}, [%[out1]]  \n\t"
            : /*no output*/
            : [out0] "r" (dst + dj),      [out1] "r" (dst + dj + 32),
              [in0] "r" (src + sj),
              "w" (vc255)
            : "d0","d1","d2","d3","d4","d5","d6","d7"
            );
#else
            vRgba.val[0]  = vld1q_u8(src + sj);

            vRgba.val[1] = vRgba.val[0];
            vRgba.val[2] = vRgba.val[0];

            vst4q_u8(dst + dj, vRgba);
#endif
        }

        if (sj < roiw8)
        {
#if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
            __asm__ (
            "vld1.8 {d5}, [%[in]]                \n\t"
            "vmov.8 d6, d5                       \n\t"
            "vmov.8 d7, d5                       \n\t"
            "vst4.8 {d5-d8}, [%[out]]            \n\t"
            : /*no output*/
            : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255)
            : "d5","d6","d7"
            );
#else
            vRgba2.val[0] = vld1_u8(src + sj);
            vRgba2.val[1] = vRgba2.val[0];
            vRgba2.val[2] = vRgba2.val[0];

            vst4_u8(dst + dj, vRgba2);
#endif
            sj += 8; dj += 32;
        }

        for (; sj < size.width; sj++, dj += 4)
        {
            dst[dj+0] = src[sj];
            dst[dj+1] = src[sj];
            dst[dj+2] = src[sj];
            dst[dj+3] = 255;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

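// Several of the channel-reshuffling converters below (rgb2rgbx, rgbx2rgb,
// rgbx2bgr, rgb2bgrx) rely on a small union trick: a 4-plane and a 3-plane
// NEON structure share storage, so loading three planes and storing four (or
// vice versa) adds or drops the alpha plane without copying the color planes.
// Where alpha is produced, the constant 255 plane is set up once before the
// loops.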
void rgb2rgbx(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
    register uint8x8_t vc255_0  asm ("d3") = vmov_n_u8(255);
#else
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;
    v_dst0.v4.val[3] = vdupq_n_u8(255);
    union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;
    v_dst.v4.val[3] = vdup_n_u8(255);
#endif

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u, j = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
        for (; j < roiw8; sj += 24, dj += 32, j += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
                "vld3.8 {d0, d1, d2}, [%[in0]]             \n\t"
                "vst4.8 {d0, d1, d2, d3}, [%[out0]]        \n\t"
                : /*no output*/
                : [out0] "r" (dst + dj),
                  [in0]  "r" (src + sj),
                  "w" (vc255_0)
                : "d0","d1","d2"
            );
        }
#else
        for (; j < roiw16; sj += 48, dj += 64, j += 16)
        {
            internal::prefetch(src + sj);
            v_dst0.v3 = vld3q_u8(src + sj);
            vst4q_u8(dst + dj, v_dst0.v4);
        }

        if (j < roiw8)
        {
            v_dst.v3 = vld3_u8(src + sj);
            vst4_u8(dst + dj, v_dst.v4);
            sj += 24; dj += 32; j += 8;
        }
#endif

        for (; j < size.width; ++j, sj += 3, dj += 4)
        {
            dst[dj] = src[sj];
            dst[dj + 1] = src[sj + 1];
            dst[dj + 2] = src[sj + 2];
            dst[dj + 3] = 255;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void rgbx2rgb(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;
    union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;
#endif

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u, j = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
        for (; j < roiw8; sj += 32, dj += 24, j += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
                "vld4.8 {d0, d1, d2, d3}, [%[in0]]         \n\t"
                "vst3.8 {d0, d1, d2}, [%[out0]]            \n\t"
                : /*no output*/
                : [out0] "r" (dst + dj),
                  [in0]  "r" (src + sj)
                : "d0","d1","d2","d3"
            );
        }
#else
        for (; j < roiw16; sj += 64, dj += 48, j += 16)
        {
            internal::prefetch(src + sj);
            v_dst0.v4 = vld4q_u8(src + sj);
            vst3q_u8(dst + dj, v_dst0.v3);
        }

        if (j < roiw8)
        {
            v_dst.v4 = vld4_u8(src + sj);
            vst3_u8(dst + dj, v_dst.v3);
            sj += 32; dj += 24; j += 8;
        }
#endif

        for (; j < size.width; ++j, sj += 4, dj += 3)
        {
            dst[dj] = src[sj];
            dst[dj + 1] = src[sj + 1];
            dst[dj + 2] = src[sj + 2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void rgb2bgr(const Size2D &size,
             const u8 * srcBase, ptrdiff_t srcStride,
             u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u, j = 0u;


#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
        for (; j < roiw8; sj += 24, dj += 24, j += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
                "vld3.8 {d0, d1, d2}, [%[in0]]             \n\t"
                "vswp d0, d2                               \n\t"
                "vst3.8 {d0, d1, d2}, [%[out0]]            \n\t"
                : /*no output*/
                : [out0] "r" (dst + dj),
                  [in0]  "r" (src + sj)
                : "d0","d1","d2"
            );
        }
#else
        for (; j < roiw16; sj += 48, dj += 48, j += 16)
        {
            internal::prefetch(src + sj);
            uint8x16x3_t vals0 = vld3q_u8(src + sj);

            std::swap(vals0.val[0], vals0.val[2]);

            vst3q_u8(dst + dj, vals0);
        }

        if (j < roiw8)
        {
            uint8x8x3_t vals = vld3_u8(src + sj);
            std::swap(vals.val[0], vals.val[2]);
            vst3_u8(dst + dj, vals);
            sj += 24; dj += 24; j += 8;
        }
#endif

        for (; j < size.width; ++j, sj += 3, dj += 3)
        {
            u8 b = src[sj + 2];//Handle src == dst case
            dst[dj + 2] = src[sj    ];
            dst[dj + 1] = src[sj + 1];
            dst[dj    ] = b;
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void rgbx2bgrx(const Size2D &size,
               const u8 * srcBase, ptrdiff_t srcStride,
               u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u, j = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
        for (; j < roiw8; sj += 32, dj += 32, j += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
                "vld4.8 {d0, d1, d2, d3}, [%[in0]]         \n\t"
                "vswp d0, d2                               \n\t"
                "vst4.8 {d0, d1, d2, d3}, [%[out0]]        \n\t"
                : /*no output*/
                : [out0] "r" (dst + dj),
                  [in0]  "r" (src + sj)
                : "d0","d1","d2","d3"
            );
        }
#else
        for (; j < roiw16; sj += 64, dj += 64, j += 16)
        {
            internal::prefetch(src + sj);
            uint8x16x4_t vals0 = vld4q_u8(src + sj);

            std::swap(vals0.val[0], vals0.val[2]);

            vst4q_u8(dst + dj, vals0);
        }

        if (j < roiw8)
        {
            uint8x8x4_t vals = vld4_u8(src + sj);
            std::swap(vals.val[0], vals.val[2]);
            vst4_u8(dst + dj, vals);
            sj += 32; dj += 32; j += 8;
        }
#endif

        for (; j < size.width; ++j, sj += 4, dj += 4)
        {
            u8 b = src[sj + 2];//Handle src == dst case
            dst[dj + 2] = src[sj    ];
            dst[dj + 1] = src[sj + 1];
            dst[dj    ] = b;
            dst[dj + 3] = src[sj + 3];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void rgbx2bgr(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u, j = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
        for (; j < roiw8; sj += 32, dj += 24, j += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
                "vld4.8 {d0, d1, d2, d3}, [%[in0]]         \n\t"
                "vswp d0, d2                               \n\t"
                "vst3.8 {d0, d1, d2}, [%[out0]]            \n\t"
                : /*no output*/
                : [out0] "r" (dst + dj),
                  [in0]  "r" (src + sj)
                : "d0","d1","d2","d3"
            );
        }
#else
        for (; j < roiw16; sj += 64, dj += 48, j += 16)
        {
            internal::prefetch(src + sj);
            union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;
            vals0.v4 = vld4q_u8(src + sj);
            std::swap(vals0.v3.val[0], vals0.v3.val[2]);
            vst3q_u8(dst + dj, vals0.v3);
        }

        if (j < roiw8)
        {
            union { uint8x8x4_t v4; uint8x8x3_t v3; } vals;
            vals.v4 = vld4_u8(src + sj);
            std::swap(vals.v3.val[0], vals.v3.val[2]);
            vst3_u8(dst + dj, vals.v3);
            sj += 32; dj += 24; j += 8;
        }
#endif

        for (; j < size.width; ++j, sj += 4, dj += 3)
        {
            dst[dj + 2] = src[sj    ];
            dst[dj + 1] = src[sj + 1];
            dst[dj    ] = src[sj + 2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

void rgb2bgrx(const Size2D &size,
              const u8 * srcBase, ptrdiff_t srcStride,
              u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
    register uint8x8_t vc255  asm ("d3") = vmov_n_u8(255);
#else
    union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;
    vals0.v4.val[3] = vmovq_n_u8(255);
    union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8;
    vals8.v4.val[3] = vmov_n_u8(255);
#endif

#if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
#endif
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0u; i < size.height; ++i)
    {
        const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t sj = 0u, dj = 0u, j = 0u;

#if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
        for (; j < roiw8; sj += 24, dj += 32, j += 8)
        {
            internal::prefetch(src + sj);
            __asm__ (
                "vld3.8 {d0, d1, d2}, [%[in0]]             \n\t"
                "vswp d0, d2                               \n\t"
                "vst4.8 {d0, d1, d2, d3}, [%[out0]]        \n\t"
                : /*no output*/
                : [out0] "r" (dst + dj),
                  [in0]  "r" (src + sj),
                   "w" (vc255)
                : "d0","d1","d2"
            );
        }
#else
        for (; j < roiw16; sj += 48, dj += 64, j += 16)
        {
            internal::prefetch(src + sj);
            vals0.v3 = vld3q_u8(src + sj);
            std::swap(vals0.v4.val[0], vals0.v4.val[2]);
            vst4q_u8(dst + dj, vals0.v4);
        }

        if (j < roiw8)
        {
            vals8.v3 = vld3_u8(src + sj);
            std::swap(vals8.v4.val[0], vals8.v4.val[2]);
            vst4_u8(dst + dj, vals8.v4);
            sj += 24; dj += 32; j += 8;
        }
#endif

        for (; j < size.width; ++j, sj += 3, dj += 4)
        {
            dst[dj + 3] = 255;
            dst[dj + 2] = src[sj    ];
            dst[dj + 1] = src[sj + 1];
            dst[dj    ] = src[sj + 2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
#endif
}

namespace {

#ifdef CAROTENE_NEON
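// convertToHSV computes the standard RGB->HSV mapping for 8 pixels at once:
// V = max(R,G,B), S = (V - min) * 255 / V, and H from the piecewise hue
// formula, scaled to [0, hrange). The divisions are done in floating point
// with vrecpe/vrecps (a reciprocal estimate plus one Newton-Raphson
// refinement step), with the results rounded back to fixed point
// (hsv_shift = 12) before narrowing to u8.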
inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB,
                                const s32 hrange )
{
    const s32 hsv_shift = 12;
    const f32 vsdiv_table = f32(255 << hsv_shift);
    f32 vhdiv_table = f32(hrange << hsv_shift);
    const s32 vhrange = hrange;
    const s32 v0 = s32(0);
    const s32 vshift = s32(1 << (hsv_shift-1));
    const s32 v6 = s32(6);

    uint8x8_t vMin = vmin_u8(vR, vG);
    uint8x8_t vMax = vmax_u8(vR, vG);

    uint16x8_t vR_u16 = vmovl_u8(vR);
    uint16x8_t vG_u16 = vmovl_u8(vG);

    vMax = vmax_u8(vMax, vB);
    vMin = vmin_u8(vMin, vB);
    uint16x8_t vB_u16 = vmovl_u8(vB);

    uint16x8_t vDiff = vsubl_u8(vMax, vMin);

    uint16x8_t vV = vmovl_u8(vMax);
    uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff);
    uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff));
    uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff));

    uint16x8_t vVEqR = vceqq_u16(vR_u16, vV);
    uint16x8_t vVEqG = vceqq_u16(vG_u16, vV);

    int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16));
    uint16x8_t vInvR = vmvnq_u16(vVEqR);
    int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16));
    int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16));

    uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR);
    vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR);
    int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2));

    vVEqR = vmvnq_u16(vVEqG);
    vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2));
    vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR));
    vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2);
    vR_G = vaddq_s16(vR_G, vB_R);
    int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR));

    uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV));
    vR_G = vandq_s16(vR_G, vG_B);
    uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV));
    int16x8_t vDiff4 = vaddq_s16(vH, vR_G);

    int32x4_t vc6 = vdupq_n_s32(v6);
    uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6));
    uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6));

    float32x4_t vF1 = vcvtq_f32_u32(vV_L);
    float32x4_t vF2 = vcvtq_f32_u32(vV_H);
    float32x4_t vHF1 = vcvtq_f32_u32(vLine1);
    float32x4_t vHF2 = vcvtq_f32_u32(vLine2);

    float32x4_t vXInv1 = vrecpeq_f32(vF1);
    float32x4_t vXInv2 = vrecpeq_f32(vF2);
    float32x4_t vXInv3 = vrecpeq_f32(vHF1);
    float32x4_t vXInv4 = vrecpeq_f32(vHF2);

    float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1);
    float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2);
    float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1);
    float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2);

    vF1 = vmulq_f32(vXInv1, vSt1);
    vF2 = vmulq_f32(vXInv2, vSt2);
    vHF1 = vmulq_f32(vXInv3, vSt3);
    vHF2 = vmulq_f32(vXInv4, vSt4);

    float32x4_t vDivTab = vdupq_n_f32(vsdiv_table);
    vSt1 = vmulq_f32(vF1, vDivTab);
    vSt2 = vmulq_f32(vF2, vDivTab);
    vDivTab = vdupq_n_f32(vhdiv_table);
    vSt3 = vmulq_f32(vHF1, vDivTab);
    vSt4 = vmulq_f32(vHF2, vDivTab);

    float32x4_t bias = vdupq_n_f32(0.5f);

    vSt1 = vaddq_f32(vSt1, bias);
    vSt2 = vaddq_f32(vSt2, bias);
    vSt3 = vaddq_f32(vSt3, bias);
    vSt4 = vaddq_f32(vSt4, bias);

    uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
    uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
    uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
    uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);

    int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
    int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));

    uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1);
    uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2);
    uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3);
    uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4);

    int32x4_t vShift = vdupq_n_s32(vshift);
    uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift));
    uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift));
    uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift));
    uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift));
    int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8);
    int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8);
    int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8);
    int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8);

    int16x8_t vc0 = vdupq_n_s16((s16)v0);
    int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4);
    uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0);
    int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4);

    int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16));
    int8x8_t vcHRange = vdup_n_s8((s8)vhrange);
    uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange));
    int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd));

    uint8x8x3_t vHsv;
    vHsv.val[0] = vreinterpret_u8_s8(vHRes);
    vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8);
    vHsv.val[2] = vMax;

    return vHsv;
}

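// fastSaturate8u is a clamping lookup table: indexing with (x + 256) maps any
// x in [-256, 512] to the value of x saturated to [0, 255]. The scalar
// convertToHSV below uses it to compute the max and min of the three channels
// without branches.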
const u8 fastSaturate8u[] =
{
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
      0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
     16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
     32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
     48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
     64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
     80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
     96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
    112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
    144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
    176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
    208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
    240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
    255
};

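// Scalar counterpart of the NEON convertToHSV above: the same H/S/V formulas,
// computed one pixel at a time in fixed point (hsv_shift = 12) with the
// branchless max/min lookup described above.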
inline void convertToHSV(const s32 r, const s32 g, const s32 b,
                         const s32 &hrange, const s32 &hsv_shift,
                         u8* dst)
{
    s32 h, s, v = b;
    s32 vmin = b, diff;
    s32 vr, vg;

    v += fastSaturate8u[g-v+256];
    v += fastSaturate8u[r-v+256];
    vmin -= fastSaturate8u[vmin-g+256];
    vmin -= fastSaturate8u[vmin-r+256];

    diff = v - vmin;
    vr = v == r ? -1 : 0;
    vg = v == g ? -1 : 0;

    s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift;
    h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
    h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift;
    h += h < 0 ? hrange : 0;

    dst[0] = internal::saturate_cast<u8>(h);
    dst[1] = (u8)s;
    dst[2] = (u8)v;
}

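// CONVERT_TO_HSV_ASM is the 32-bit ARM inline-assembly counterpart of the
// intrinsic convertToHSV: the load instruction and the registers that hold
// the R and B planes are macro parameters, so the same body can serve RGB,
// BGR, RGBA and BGRA inputs.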
#define CONVERT_TO_HSV_ASM(loadop, rreg, breg)                        \
            __asm__ (                                                 \
               #loadop    ", [%[in]] @RGB                       \n\t" \
            "vmin.u8 d3, d0, d1      @VMin (d3)                 \n\t" \
            "vmax.u8 d6, d0, d1      @V (d6)                    \n\t" \
            "vmovl.u8 q2, " #rreg "  @V16_R (d4,d5)             \n\t" \
            "vmovl.u8 q4, d1         @V16_G (d8,d9)             \n\t" \
            "vmax.u8 d6, d6, d2                                 \n\t" \
            "vmin.u8 d3, d3, d2                                 \n\t" \
            "vmovl.u8 q0, " #breg "  @V16_B (d0,d1)             \n\t" \
            "vsubl.u8 q8, d6, d3     @V16_Diff (d16,d17)        \n\t" \
                                                                      \
            "vmovl.u8 q5, d6         @V16_V (d10,d11)           \n\t" \
            "vadd.s16 q10, q8, q8    @V16_Diff_2 (d20,d21)      \n\t" \
            "vmovl.u16 q9, d16       @V32_Diff_L (d18,d19)      \n\t" \
            "vmovl.u16 q11, d17      @V32_Diff_H (d22,d23)      \n\t" \
            "vceq.u16 q12, q2, q5    @V==R(d24,d25)             \n\t" \
            "vceq.u16 q13, q4, q5    @V==G(d26,d27)             \n\t" \
                                                                      \
            "vsub.s16 q8, q4, q0     @V16_G-B (d16,d17)         \n\t" \
            "vmvn.u16 q15, q12       @V16~R                     \n\t" \
            "vsub.s16 q6, q0, q2     @V16_B-R (d12,d13)         \n\t" \
            "vsub.s16 q7, q2, q4     @V16_R-G (d14,d15)         \n\t" \
            "vand.u16 q1, q13, q15   @VMask2                    \n\t" \
            "vand.u16 q2, q8, q12    @V16_H(d4,d5)              \n\t" \
            "vadd.s16 q4, q6, q10    @V16_H2                    \n\t" \
            "vmvn.u16 q12, q13       @V16~G                     \n\t" \
            "vadd.s16 q6, q10, q10   @VDiff16_4 (d12,d13)       \n\t" \
            "vand.u16 q8, q15, q12   @VMask3                    \n\t" \
            "vand.u16 q15, q4, q1    @vH2(d30,d31)              \n\t" \
            "vadd.s16 q7, q7, q6     @V16_H3 (d14,d15)          \n\t" \
            "vadd.s16 q14, q2, q15   @vH16                      \n\t" \
            "vmovl.u16 q12, d10      @V32_V_L                   \n\t" \
            "vand.s16 q7, q7, q8     @vH16                      \n\t" \
            "vmovl.u16 q13, d11      @V32_V_H                   \n\t" \
            "vadd.s16 q2, q14, q7    @V16_Diff_4                \n\t" \
                                                                      \
            "vdup.32 q4, %[v6]                                  \n\t" \
            "vmul.u32 q14, q9, q4                               \n\t" \
            "vmul.u32 q15, q11, q4                              \n\t" \
            "vcvt.f32.u32 q4, q12     @VF1 (d8,d9)              \n\t" \
            "vcvt.f32.u32 q8, q13     @VF2                      \n\t" \
            "vcvt.f32.u32 q0, q14     @HF1                      \n\t" \
            "vcvt.f32.u32 q1, q15     @HF2                      \n\t" \
            "vrecpe.f32 q12, q4       @Vxinv                    \n\t" \
            "vrecpe.f32 q13, q8       @Vxinv                    \n\t" \
            "vrecpe.f32 q5, q0        @Vxinv                    \n\t" \
            "vrecpe.f32 q7, q1        @Vxinv                    \n\t" \
            "vrecps.f32 q14, q12, q4  @Vst1                     \n\t" \
            "vrecps.f32 q15, q13, q8  @Vst1                     \n\t" \
            "vrecps.f32 q10, q5, q0   @Vst1                     \n\t" \
            "vrecps.f32 q6, q7, q1    @Vst1                     \n\t" \
            "vmul.f32 q4, q12, q14                              \n\t" \
            "vmul.f32 q8, q13, q15                              \n\t" \
            "vmul.f32 q0, q5, q10                               \n\t" \
            "vmul.f32 q1, q7, q6                                \n\t" \
            "vdup.32 q12, %[vsdiv_table]                        \n\t" \
            "vmul.f32 q14, q4, q12                              \n\t" \
            "vmul.f32 q15, q8, q12                              \n\t" \
            "vdup.32 q12, %[vhdiv_table]                        \n\t" \
            "vmul.f32 q10, q0, q12                              \n\t" \
            "vmul.f32 q6, q1, q12                               \n\t" \
1359                                                                       \
1360             "vdup.32 q12, %[bias]                               \n\t" \
1361                                                                       \
1362             "vadd.f32 q7, q14, q12                              \n\t" \
1363             "vadd.f32 q13, q15, q12                             \n\t" \
1364             "vcvt.u32.f32 q4, q7                                \n\t" \
1365             "vcvt.u32.f32 q8, q13                               \n\t" \
1366                                                                       \
1367             "vadd.f32 q14, q10, q12                             \n\t" \
1368             "vadd.f32 q7, q6, q12                               \n\t" \
1369             "vcvt.u32.f32 q0, q14                               \n\t" \
1370             "vcvt.u32.f32 q1, q7      @Vres                     \n\t" \
1371                                                                       \
1372             "vmovl.s16 q7, d4         @V32_H_L (d14,d15)        \n\t" \
1373             "vmovl.s16 q5, d5         @V32_H_H (d10,d11)        \n\t" \
1374             "vmul.u32 q14, q9, q4                               \n\t" \
1375             "vmul.u32 q15, q11, q8                              \n\t" \
1376             "vmul.u32 q10, q7, q0                               \n\t" \
1377             "vmul.u32 q6, q5, q1                                \n\t" \
1378                                                                       \
1379             "vdup.32 q12, %[vshift]                             \n\t" \
1380             "vadd.u32 q13, q14, q12                             \n\t" \
1381             "vadd.u32 q8, q15, q12                              \n\t" \
1382             "vadd.u32 q0, q10, q12                              \n\t" \
1383             "vadd.u32 q1, q6, q12                               \n\t" \
1384             "vshrn.s32 d8, q13, #8                              \n\t" \
1385             "vshrn.s32 d9, q8, #8                               \n\t" \
1386             "vshrn.s32 d10, q0, #8                              \n\t" \
1387             "vshrn.s32 d11, q1, #8                              \n\t" \
1388                                                                       \
1389             "vdup.16 q8, %[v0]                                  \n\t" \
1390             "vshrn.s16 d5, q4, #4                               \n\t" \
1391             "vclt.s16 q9, q5, q8                                \n\t" \
1392             "vshrn.s16 d4, q5, #4                               \n\t" \
1393                                                                       \
1394             "vmovn.s16 d9, q9                                   \n\t" \
1395             "vdup.8 d7, %[vhrange]                              \n\t" \
1396             "vand.u8 d10, d9, d7                                \n\t" \
1397             "vadd.s8 d4, d4, d10                                \n\t" \
1398             "vst3.8 {d4-d6}, [%[out]] @HSV                      \n\t" \
1399             : /*no output*/                                           \
1400             : [out] "r" (dst + dj), [in] "r" (src + sj),              \
1401                         [vsdiv_table] "r" (vsdiv_table),              \
1402                         [vshift] "r" (vshift),                        \
1403                         [vhdiv_table] "r" (vhdiv_table),              \
1404                         [v6] "r" (v6), [vhrange] "r" (vhrange),       \
1405                         [v0] "r" (v0), [bias] "r" (bias)              \
1406             : "d0","d1","d2","d3","d4","d5","d6","d7",                \
1407               "d8","d9","d10","d11","d12","d13","d14","d15",          \
1408               "d16","d17","d18","d19","d20","d21","d22","d23",        \
1409               "d24","d25","d26","d27","d28","d29","d30","d31"         \
1410             );
1411 
1412 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1413 
1414 #define YCRCB_CONSTS                                                        \
1415     register int16x4_t vcYR  asm ("d31") = vmov_n_s16(4899);                \
1416     register int16x4_t vcYG  asm ("d30") = vmov_n_s16(9617);                \
1417     register int16x4_t vcYB  asm ("d29") = vmov_n_s16(1868);                \
1418     register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860);                \
1419     register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332);                \
1420     register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765);                \
1421     register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427);
1422 
1423 #else
1424 
1425 #define YCRCB_CONSTS                                                        \
1426     const s16       convertCoeffs[] = { 4899, 4899, 4899, 4899,             \
1427                                         9617, 9617, 9617, 9617,             \
1428                                         1868, 1868, 1868, 1868,             \
1429                                         6860, 6860, 6860, 6860,             \
1430                                         1332, 1332, 1332, 1332,             \
1431                                         2765, 2765, 2765, 2765,             \
1432                                         5427, 5427, 5427, 5427  };          \
1433     const int16x8_t vcYRG  = vld1q_s16(convertCoeffs);      /*YR and YG*/   \
1434     const int16x4_t vcYB   = vld1_s16(convertCoeffs + 8);   /*YB*/          \
1435     const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \
1436     const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/
1437 
1438 #endif
1439 
1440 #define CONVERTTOYCRCB(loadcmd, rreg, greg, breg)                           \
1441     __asm__ (                                                               \
1442        #loadcmd   ", [%[in]] @RGB                       \n\t"               \
1443     "vmovl.u8 q2, " #rreg "  @R (d4,d5)                 \n\t"               \
1444     "vmovl.u8 q3, " #greg "  @G (d6,d7)                 \n\t"               \
1445     "vmovl.u8 q4, " #breg "  @B (d8,d9)                 \n\t"               \
1446                                                                             \
1447     "vshll.u16 q7, d4, #13   @Cr(q7,q8):  R             \n\t"               \
1448     "vmull.u16 q5, d6, d30   @Y (q5,q6):  G             \n\t"               \
1449     "vshll.u16 q9, d8, #13   @Cb(q9,q10): B             \n\t"               \
1450     "vshll.u16 q8, d5, #13   @Cr(q7,q8):  R             \n\t"               \
1451     "vmull.u16 q6, d7, d30   @Y (q5,q6):  G             \n\t"               \
1452     "vshll.u16 q10, d9, #13  @Cb(q9,q10): B             \n\t"               \
1453                                                                             \
1454     "vmlsl.s16 q7, d6, d28   @Cr(q7,q8):  RG            \n\t"               \
1455     "vmlal.s16 q5, d8, d29   @Y (q5,q6):  GB            \n\t"               \
1456     "vmlsl.s16 q9, d4, d26   @Cb(q9,q10): BR            \n\t"               \
1457     "vmlsl.s16 q8, d7, d28   @Cr(q7,q8):  RG            \n\t"               \
1458     "vmlal.s16 q6, d9, d29   @Y (q5,q6):  GB            \n\t"               \
1459     "vmlsl.s16 q10, d5, d26  @Cb(q9,q10): BR            \n\t"               \
1460                                                                             \
1461     "vmlsl.s16 q7, d8, d27   @Cr(q7,q8):  RGB           \n\t"               \
1462     "vmlal.s16 q5, d4, d31   @Y (q5,q6):  GBR           \n\t"               \
1463     "vmlsl.s16 q9, d6, d25   @Cb(q9,q10): BRG           \n\t"               \
1464     "vmlsl.s16 q8, d9, d27   @Cr(q7,q8):  RGB           \n\t"               \
1465     "vmlal.s16 q6, d5, d31   @Y (q5,q6):  GBR           \n\t"               \
1466     "vmlsl.s16 q10, d7, d25  @Cb(q9,q10): BRG           \n\t"               \
1467                                                                             \
1468     "vrshrn.s32 d4, q7, #14  @Cr -> q2                  \n\t"               \
1469     "vrshrn.s32 d8, q5, #14  @Y  -> q4                  \n\t"               \
1470     "vrshrn.s32 d6, q9, #14  @Cb -> q3                  \n\t"               \
1471     "vrshrn.s32 d5, q8, #14  @Cr -> q2                  \n\t"               \
1472     "vrshrn.s32 d9, q6, #14  @Y  -> q4                  \n\t"               \
1473     "vrshrn.s32 d7, q10, #14 @Cb -> q3                  \n\t"               \
1474                                                                             \
1475     "vmov.s16 q5, #128                                  \n\t"               \
1476     "vmov.s16 q6, #128                                  \n\t"               \
1477     "vadd.i16 q5, q2         @Cr -> q5                  \n\t"               \
1478     "vadd.i16 q6, q3         @Cb -> q6                  \n\t"               \
1479                                                                             \
1480     "vqmovn.u16 d4, q4                                  \n\t"               \
1481     "vqmovun.s16 d5, q5                                 \n\t"               \
1482     "vqmovun.s16 d6, q6                                 \n\t"               \
1483                                                                             \
1484     "vst3.8 {d4-d6}, [%[out]]                           \n\t"               \
1485     : /*no output*/                                                         \
1486     : [out] "r" (dst + dj), [in] "r" (src + sj),                            \
1487       "w" (vcYR), "w" (vcYG), "w" (vcYB),                                   \
1488       "w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR)                    \
1489     : "d0","d1","d2","d3","d4","d5","d6","d7",                              \
1490       "d8","d9","d10","d11","d12","d13","d14","d15",                        \
1491       "d16","d17","d18","d19","d20","d21"                                   \
1492     );
1493 
1494 
1495 inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, const int16x8_t& vG, const int16x8_t& vB,
1496                                    const int16x8_t& vcYRG, const int16x4_t& vcYB,
1497                                    const int16x8_t& vcCrGB, const int16x8_t& vcCbRG )
1498 {
1499     int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13);                  // R
1500     int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13);                 // R
1501     int32x4_t vYL  = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG));  // G
1502     int32x4_t vYH  = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G
1503     int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13);                  // B
1504     int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13);                 // B
1505 
1506     vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB));      // RG
1507     vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB));     // RG
1508     vYL  = vmlal_s16(vYL, vget_low_s16(vB), vcYB);                       // GB
1509     vYH  = vmlal_s16(vYH, vget_high_s16(vB), vcYB);                      // GB
1510     vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG));      // BR
1511     vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG));     // BR
1512 
1513     vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB));     // RGB
1514     vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB));    // RGB
1515     vYL  = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG));        // GBR
1516     vYH  = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG));       // GBR
1517     vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG));     // BRG
1518     vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG));    // BRG
1519 
1520     int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14);
1521     int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14);
1522     int16x4_t vYL_  = vrshrn_n_s32(vYL, 14);
1523     int16x4_t vYH_  = vrshrn_n_s32(vYH, 14);
1524     int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14);
1525     int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14);
1526 
1527     int16x8_t vCr = vmovq_n_s16(128);
1528     int16x8_t vCb = vmovq_n_s16(128);
1529 
1530     vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_));
1531     vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_));
1532 
1533     uint8x8x3_t vYCrCb;
1534     vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_)));
1535     vYCrCb.val[1] = vqmovun_s16(vCr);
1536     vYCrCb.val[2] = vqmovun_s16(vCb);
1537 
1538     return vYCrCb;
1539 }
1540 
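// convertToYCrCb above and the S_CONVERTTOYCRCB macro below implement the
// BT.601 full-range YCrCb transform in Q14 fixed point (SHIFT == 14), with the
// coefficients rounded to:
//   Y  =  0.299 R + 0.587 G + 0.114 B           ->  4899,  9617, 1868 (sum 16384)
//   Cr =  0.5   R - 0.4187 G - 0.0813 B + 128   ->  8192,  6860, 1332
//   Cb = -0.1687 R - 0.3313 G + 0.5    B + 128  ->  2765,  5427, 8192
// The vshll #13 loads R (resp. B) already multiplied by 8192, and vrshrn #14
// performs the rounding shift back down to pixel range. As a quick sanity
// check, a mid-grey pixel R = G = B = 100 (worked example, illustration only):
//
//     Y  =        (100*4899    + 100*9617 + 100*1868 + (1 << 13)) >> 14;   // 100
//     Cr = 128 + ((100*8192    - 100*6860 - 100*1332 + (1 << 13)) >> 14);  // 128
//     Cb = 128 + ((100*(-2765) - 100*5427 + 100*8192 + (1 << 13)) >> 14);  // 128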
1541 #define S_CONVERTTOYCRCB(R, G, B)                                           \
1542     s32 Y =         (R * 4899    + G * 9617 + B * 1868 + (1 << 13)) >> 14;  \
1543     s32 Cr = 128 + ((R * 8192    - G * 6860 - B * 1332 + (1 << 13)) >> 14); \
1544     s32 Cb = 128 + ((R * (-2765) - G * 5427 + B * 8192 + (1 << 13)) >> 14); \
1545     dst[dj + 0] = internal::saturate_cast<u8>(Y);                           \
1546     dst[dj + 1] = internal::saturate_cast<u8>(Cr);                          \
1547     dst[dj + 2] = internal::saturate_cast<u8>(Cb);
1548 
1549 #define COEFF_Y   (   149)
1550 #define COEFF_BU  (   129)
1551 #define COEFF_RV  (   102)
1552 #define COEFF_GU  (    25)
1553 #define COEFF_GV  (    52)
1554 #define COEFF_R   (-14248)
1555 #define COEFF_G   (  8663)
1556 #define COEFF_B   (-17705)
1557 
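// The COEFF_* values above encode the BT.601 studio-swing YUV420 -> RGB
// transform, pre-scaled so the conversion needs only halving adds and one
// rounding shift by 5:
//   COEFF_Y  = 149 ~= 1.164 * 128   (applied as (149 * max(16, Y)) >> 1)
//   COEFF_RV = 102 ~= 1.596 * 64,   COEFF_BU = 129 ~= 2.018 * 64
//   COEFF_GU =  25 ~= 0.391 * 64,   COEFF_GV =  52 ~= 0.813 * 64
//   COEFF_R, COEFF_G, COEFF_B fold the -16 luma and -128 chroma offsets into a
//   single additive constant per channel.
// Scalar sketch of one red sample, mirroring convertYUV420ToRGB further below
// (illustration only):
//
//     int y  = (COEFF_Y * std::max(16, Y)) >> 1;   // ~ 74.5 * Y
//     int uv = COEFF_R + COEFF_RV * V;             // ~ 102 * V - 14248
//     u8  R  = internal::saturate_cast<u8>((((y + uv) >> 1) + (1 << 4)) >> 5);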
1558 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1559 #define YUV420ALPHA3_CONST
1560 #define YUV420ALPHA4_CONST register uint8x16_t c255  asm ("q13") = vmovq_n_u8(255);
1561 #define YUV420ALPHA3_CONVERT
1562 #define YUV420ALPHA4_CONVERT , "w" (c255)
1563 #define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}"
1564 #define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}"
1565 #define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}"
1566 #define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}"
1567 
1568 #define YUV420_CONSTS(cn, bIdx, vIdx)                            \
1569     register const s32 cR = s16(COEFF_R);                        \
1570     register const s32 cG = s16(COEFF_G);                        \
1571     register const s32 cB = s16(COEFF_B);                        \
1572                                                                  \
1573     register uint8x16_t vc16  asm ("q15") = vmovq_n_u8(16);      \
1574     register uint8x8_t cGU    asm ("d14") = vmov_n_u8(COEFF_GU); \
1575     register uint8x8_t cGV    asm ("d15") = vmov_n_u8(COEFF_GV); \
1576     register uint8x8_t cRV    asm ("d16") = vmov_n_u8(COEFF_RV); \
1577     register uint8x8_t cBU    asm ("d17") = vmov_n_u8(COEFF_BU); \
1578     register uint8x16_t cRGBY asm ("q3")  = vmovq_n_u8(COEFF_Y); \
1579     YUV420ALPHA##cn##_CONST
1580 
1581 #define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg)                                                    \
1582     __asm__ (                                                                                             \
1583         "vld2.8 {d0-d1}, [%[inUV]]                      @UV                                         \n\t" \
1584         "vdup.16 q4, %[cG]                              @cG                                         \n\t" \
1585         "vld2.8 {d2-d3}, [%[inY1]]                      @YY                                         \n\t" \
1586         "vdup.16 "#rreg", %[cR]                         @cR                                         \n\t" \
1587         "vld2.8 {d4-d5}, [%[inY2]]                      @YY                                         \n\t" \
1588         "vdup.16 "#breg", %[cB]                         @cB                                         \n\t" \
1589         "vmlsl.u8 q4, "#ureg", d14                      @cG-25u                                     \n\t" \
1590         "vmax.u8 q1, q15                                @max(Y,16)                                  \n\t" \
1591         "vmlal.u8 "#rreg", "#vreg", d16                 @cR+102*v                                   \n\t" \
1592         "vmlal.u8 "#breg", "#ureg", d17                 @cB+129*u                                   \n\t" \
1593         "vmax.u8 q2, q15                                @max(Y,16)                                  \n\t" \
1594         "vmlsl.u8 q4, "#vreg", d15                      @cG-25u-52v                                 \n\t" \
1595                                                                          /*q10,q11,q12,q13 - for output*/ \
1596         "vmull.u8 q9, d3, d6                            @h 149*y                                    \n\t" \
1597         "vmull.u8 q10, d2, d7                           @l 149*y                                    \n\t" \
1598         "vshr.u16 q9, #1                                @h (149*y)/2                                \n\t" \
1599         "vshr.u16 q10, #1                               @l (149*y)/2                                \n\t" \
1600                                                                                                           \
1601         "vhadd.s16 q0, q9, q4                           @hG ((149*y)/2 + cG - 25*u - 52*v)/2        \n\t" \
1602         "vhadd.s16 q12, q10, q6                         @lB ((149*y)/2 + cB + 129*u)/2              \n\t" \
1603         "vhadd.s16 q1, q9, q5                           @hR ((149*y)/2 + cR + 102*v)/2              \n\t" \
1604         "vhadd.s16 q11, q10, q4                         @lG ((149*y)/2 + cG - 25*u - 52*v)/2        \n\t" \
1605         "vhadd.s16 q9, q6                               @hB ((149*y)/2 + cB + 129*u)/2              \n\t" \
1606         "vhadd.s16 q10, q5                              @lR ((149*y)/2 + cR + 102*v)/2              \n\t" \
1607                                                                                                           \
1608         "vqrshrun.s16 d24, q12, #5                      @lB ((149*y)/2 + cB + 129*u)/2/32           \n\t" \
1609         "vqrshrun.s16 d22, q11, #5                      @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32     \n\t" \
1610         "vqrshrun.s16 d20, q10, #5                      @lR ((149*y)/2 + cR + 102*v)/2/32           \n\t" \
1611         "vqrshrun.s16 d23, q0, #5                       @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32     \n\t" \
1612         "vqrshrun.s16 d21, q1, #5                       @hR ((149*y)/2 + cR + 102*v)/2/32           \n\t" \
1613         "vqrshrun.s16 d25, q9, #5                       @hB ((149*y)/2 + cB + 129*u)/2/32           \n\t" \
1614                                                                                                           \
1615         "vzip.8 d22, d23                                @G                \n\t"                           \
1616         "vzip.8 d20, d21                                @R                \n\t"                           \
1617         "vzip.8 d24, d25                                @B                \n\t"                           \
1618                                                                                                           \
1619         YUV420STORE1CMD##cn", [%[out1]]                                \n\t"                              \
1620         YUV420STORE2CMD##cn", [%[out1x]]                               \n\t"                              \
1621                                                                                                           \
1622         "vmull.u8 q9, d5, d6                            @h 149*y                \n\t"                     \
1623         "vmull.u8 q10, d4, d7                           @l 149*y                \n\t"                     \
1624         "vshr.u16 q9, #1                                @h (149*y)/2            \n\t"                     \
1625         "vshr.u16 q10, #1                               @l (149*y)/2            \n\t"                     \
1626                                                                                                           \
1627         "vhadd.s16 q0, q9, q4                           @hG ((149*y)/2 + cG - 25*u - 52*v)/2        \n\t" \
1628         "vhadd.s16 q12, q10, q6                         @lB ((149*y)/2 + cB + 129*u)/2              \n\t" \
1629         "vhadd.s16 q1, q9, q5                           @hR ((149*y)/2 + cR + 102*v)/2              \n\t" \
1630         "vhadd.s16 q11, q10, q4                         @lG ((149*y)/2 + cG - 25*u - 52*v)/2        \n\t" \
1631         "vhadd.s16 q9, q6                               @hB ((149*y)/2 + cB + 129*u)/2              \n\t" \
1632         "vhadd.s16 q10, q5                              @lR ((149*y)/2 + cR + 102*v)/2              \n\t" \
1633                                                                                                           \
1634         "vqrshrun.s16 d24, q12, #5                      @lB ((149*y)/2 + cB + 129*u)/2/32           \n\t" \
1635         "vqrshrun.s16 d22, q11, #5                      @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32     \n\t" \
1636         "vqrshrun.s16 d20, q10, #5                      @lR ((149*y)/2 + cR + 102*v)/2/32           \n\t" \
1637         "vqrshrun.s16 d23, q0, #5                       @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32     \n\t" \
1638         "vqrshrun.s16 d21, q1, #5                       @hR ((149*y)/2 + cR + 102*v)/2/32           \n\t" \
1639         "vqrshrun.s16 d25, q9, #5                       @hB ((149*y)/2 + cB + 129*u)/2/32           \n\t" \
1640                                                                                                           \
1641         "vzip.8 d22, d23                                @G                \n\t"                           \
1642         "vzip.8 d20, d21                                @R                \n\t"                           \
1643         "vzip.8 d24, d25                                @B                \n\t"                           \
1644                                                                                                           \
1645         YUV420STORE1CMD##cn", [%[out2]]                                \n\t"                              \
1646         YUV420STORE2CMD##cn", [%[out2x]]                               \n\t"                              \
1647                                                                                                           \
1648         : /*no output*/                                                                                   \
1649         : [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj),                                                 \
1650           [out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8),                                     \
1651           [inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j),                                        \
1652           [cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB),                                                    \
1653           "w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT   \
1654         : "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12",                                      \
1655           "d13","d18","d19","d20","d21","d22","d23","d24","d25"                                           \
1656     );
1657 
1658 #else
1659 
1660 template<int bIdx>
1661 struct _convertYUV420Internals
1662 {
1663     uint16x8_t vc14216;
1664     uint16x8_t vc17672;
1665     uint16x8_t vc8696;
1666     uint8x8_t  vc102;
1667     uint8x8_t  vc25;
1668     uint8x8_t  vc129;
1669     uint8x8_t  vc52;
1670     uint16x8_t vc_1;
1671     uint8x8_t  vc149;
1672     uint8x8_t  vc16;
1673     _convertYUV420Internals()
1674     {
1675         vc14216 = vdupq_n_u16(-COEFF_R);
1676         vc17672 = vdupq_n_u16(-COEFF_B);
1677         vc8696  = vdupq_n_u16(COEFF_G);
1678         vc102   = vdup_n_u8(COEFF_RV);
1679         vc25    = vdup_n_u8(COEFF_GU);
1680         vc129   = vdup_n_u8(COEFF_BU);
1681         vc52    = vdup_n_u8(COEFF_GV);
1682         vc_1    = vdupq_n_u16((uint16_t)-1);
1683         vc149   = vdup_n_u8(COEFF_Y);
1684         vc16    = vdup_n_u8(16);
1685     }
1686 
1687     inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv,
1688                             const u8 *y, uint8x16x3_t &rgbl )
1689     {
1690         //y get line
1691         uint8x8x2_t yl = vld2_u8(y);
1692         yl.val[0] = vmax_u8(yl.val[0], vc16);
1693         yl.val[1] = vmax_u8(yl.val[1], vc16);
1694 
1695         //y part line
1696         uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y)
1697         uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y)
1698         int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1);  //(-1+149*y)/2
1699         int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1);  //(-1+149*y)/2
1700 
1701         //y line calc rgb
1702         int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
1703         int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
1704         int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
1705         int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
1706         int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
1707         int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
1708 
1709         //y line clamp + narrow
1710         uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5);
1711         uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5);
1712         uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5);
1713         uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n);
1714         uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5);
1715         uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5);
1716         uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n);
1717         uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5);
1718         uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n);
1719         rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]);
1720         rgbl.val[1]        = vcombine_u8(g1.val[0], g1.val[1]);
1721         rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], b1.val[1]);
1722     }
1723 };
1724 
1725 template<int cn, int bIdx, int vIdx>
1726 struct _convertYUV420
1727 {
1728     _convertYUV420Internals<bIdx> convertYUV420Internals;
1729 
1730     inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
1731                        u8 *dst1, u8 *dst2 )
1732     {
1733         uint8x8x2_t raw_uv = vld2_u8(uv);
1734         uint16x8_t gu =            vmlsl_u8(convertYUV420Internals.vc8696,  raw_uv.val[1-vIdx], convertYUV420Internals.vc25);  //(8696-25*u)
1735         int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
1736 
1737         int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
1738         int16x8_t guv = (int16x8_t)vmlsl_u8(gu,      raw_uv.val[vIdx], convertYUV420Internals.vc52);  //((8696-25*u)-52*v))
1739 
1740         uint8x16x3_t rgbl;
1741         //y line1
1742         convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl);
1743         vst3q_u8(dst1, rgbl);
1744         //y line2
1745         convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl);
1746         vst3q_u8(dst2, rgbl);
1747     }
1748 };
1749 
1750 template<int bIdx, int vIdx>
1751 struct _convertYUV420<4, bIdx, vIdx>
1752 {
1753     _convertYUV420Internals<bIdx> convertYUV420Internals;
1754 
1755     inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
1756                        u8 *dst1, u8 *dst2 )
1757     {
1758         uint8x8x2_t raw_uv = vld2_u8(uv);
1759         uint16x8_t gu =            vmlsl_u8(convertYUV420Internals.vc8696,  raw_uv.val[1-vIdx], convertYUV420Internals.vc25);  //(8696-25*u)
1760         int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
1761 
1762         int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
1763         int16x8_t guv = (int16x8_t)vmlsl_u8(gu,      raw_uv.val[vIdx], convertYUV420Internals.vc52);  //((8696-25*u)-52*v))
1764 
1765         union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl;
1766         rgbl.v4.val[3] = vdupq_n_u8(0xff);
1767         //y line1
1768         convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl.v3);
1769         vst4q_u8(dst1, rgbl.v4);
1770         //y line2
1771         convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3);
1772         vst4q_u8(dst2, rgbl.v4);
1773     }
1774 };
1775 
1776 #define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420<cn, bIdx, vIdx> convertYUV420;
1777 
1778 #endif
1779 
1780 template <int cn> inline void fillAlpha(u8 *, u8 *){}
1781 template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2)
1782 {
1783     dst1[3] = 255;
1784     dst1[7] = 255;
1785     dst2[3] = 255;
1786     dst2[7] = 255;
1787 }
1788 template <int cn, int bIdx, int vIdx>
1789 inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2)
1790 {
1791     int Y11 = y1[0];
1792     int Y12 = y1[1];
1793     int Y21 = y2[0];
1794     int Y22 = y2[1];
1795 
1796     int U = uv[1 - vIdx];
1797     int V = uv[vIdx];
1798 
1799     int y11 = (COEFF_Y * std::max(16, Y11)) >> 1;
1800     int y12 = (COEFF_Y * std::max(16, Y12)) >> 1;
1801     int y21 = (COEFF_Y * std::max(16, Y21)) >> 1;
1802     int y22 = (COEFF_Y * std::max(16, Y22)) >> 1;
1803 
1804     int uvR = COEFF_R +                COEFF_RV * V;
1805     int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V;
1806     int uvB = COEFF_B + COEFF_BU * U;
1807 
1808     dst1[2-bIdx] = internal::saturate_cast<u8>((((y11 + uvR) >> 1) + (1 << 4)) >> 5);
1809     dst1[1] = internal::saturate_cast<u8>((((y11 + uvG) >> 1) + (1 << 4)) >> 5);
1810     dst1[bIdx] = internal::saturate_cast<u8>((((y11 + uvB) >> 1) + (1 << 4)) >> 5);
1811 
1812     dst1[cn+2-bIdx] = internal::saturate_cast<u8>((((y12 + uvR) >> 1) + (1 << 4)) >> 5);
1813     dst1[cn+1] = internal::saturate_cast<u8>((((y12 + uvG) >> 1) + (1 << 4)) >> 5);
1814     dst1[cn+bIdx] = internal::saturate_cast<u8>((((y12 + uvB) >> 1) + (1 << 4)) >> 5);
1815 
1816     dst2[2-bIdx] = internal::saturate_cast<u8>((((y21 + uvR) >> 1) + (1 << 4)) >> 5);
1817     dst2[1] = internal::saturate_cast<u8>((((y21 + uvG) >> 1) + (1 << 4)) >> 5);
1818     dst2[bIdx] = internal::saturate_cast<u8>((((y21 + uvB) >> 1) + (1 << 4)) >> 5);
1819 
1820     dst2[cn+2-bIdx] = internal::saturate_cast<u8>((((y22 + uvR) >> 1) + (1 << 4)) >> 5);
1821     dst2[cn+1] = internal::saturate_cast<u8>((((y22 + uvG) >> 1) + (1 << 4)) >> 5);
1822     dst2[cn+bIdx] = internal::saturate_cast<u8>((((y22 + uvB) >> 1) + (1 << 4)) >> 5);
1823 
1824     fillAlpha<cn>(dst1, dst2);
1825 }
1826 
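// The YUV420 helpers above consume semi-planar 4:2:0 data: two full-resolution
// luma rows (y1, y2) share one interleaved chroma row (uv). The vIdx template
// parameter selects UV (NV12-style) versus VU (NV21-style) ordering, bIdx picks
// RGB versus BGR output, and cn chooses 3- or 4-channel output, so each call
// emits a 2x2 block of pixels per chroma sample. A hypothetical instantiation
// for 3-channel BGR output from NV21-ordered chroma (illustration only, the
// pointer names are placeholders):
//
//     // convertYUV420ToRGB<3, 0, 0>(yRow, yRow + yStride, uvRow, out1, out2);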
1827 // converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively
1828 inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB )
1829 {
1830     uint8x16x2_t vRgb565;                               // rrrrRRRR ggggGGGG bbbbBBBB
1831 
1832     vRgb565.val[1] = vsriq_n_u8(vB, vG, 5);             // xxxxxxxx bbbbBggg
1833     vRgb565.val[0] = vshlq_n_u8(vG, 3);                 // gGGGG000 bbbbBggg
1834     vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg
1835 
1836     return vRgb565;
1837 }
1838 inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst )
1839 {
1840     *((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8);
1841 }
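// Packing check for the scalar convertTo565 above: the first argument lands in
// bits 0-4 of the little-endian u16, the second in bits 5-10 and the third in
// bits 11-15, which matches the byte pair produced by the vector overload.
// Worked examples (illustration only):
//
//     u8 px[2];
//     convertTo565(255,   0,   0, px);   // *(u16*)px == 0x001F
//     convertTo565(  0, 255,   0, px);   // *(u16*)px == 0x07E0
//     convertTo565(  0,   0, 255, px);   // *(u16*)px == 0xF800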
1842 #endif
1843 
1844 } //namespace
1845 
1846 void rgb2hsv(const Size2D &size,
1847              const u8 * srcBase, ptrdiff_t srcStride,
1848              u8 * dstBase, ptrdiff_t dstStride,
1849              s32 hrange)
1850 {
1851     internal::assertSupportedConfiguration();
1852 #ifdef CAROTENE_NEON
1853     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1854     const s32 hsv_shift = 12;
1855 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1856     register const f32 vsdiv_table = f32(255 << hsv_shift);
1857     register f32 vhdiv_table = f32(hrange << hsv_shift);
1858     register const s32 vhrange = hrange;
1859     register const s32 v0 = s32(0);
1860     register const s32 vshift = s32(1 << (hsv_shift-1));
1861     register const s32 v6 = s32(6);
1862     register const f32 bias = 0.5f;
1863 #endif
1864 
1865     for (size_t i = 0u; i < size.height; ++i)
1866     {
1867         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1868         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1869         size_t sj = 0u, dj = 0u, j = 0u;
1870 
1871         for (; j < roiw8; sj += 24, dj += 24, j += 8)
1872         {
1873             internal::prefetch(src + sj);
1874 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1875             CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2)
1876 #else
1877             uint8x8x3_t vRgb = vld3_u8(src + sj);
1878             uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
1879             vst3_u8(dst + dj, vHsv);
1880 #endif
1881         }
1882 
1883         for (; j < size.width; ++j, sj += 3, dj += 3)
1884         {
1885             convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
1886         }
1887     }
1888 #else
1889     (void)size;
1890     (void)srcBase;
1891     (void)srcStride;
1892     (void)dstBase;
1893     (void)dstStride;
1894     (void)hrange;
1895 #endif
1896 }
1897 
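// Typical call pattern for the HSV converters in this file, assuming a tightly
// packed interleaved image, byte strides, Size2D(width, height) argument order
// and the OpenCV-style 8-bit hue range of 180 (a sketch under those
// assumptions, not part of the build):
//
//     // CAROTENE_NS::rgb2hsv(Size2D(w, h), rgb, w * 3, hsv, w * 3, 180);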
1898 void rgbx2hsv(const Size2D &size,
1899               const u8 * srcBase, ptrdiff_t srcStride,
1900               u8 * dstBase, ptrdiff_t dstStride,
1901               s32 hrange)
1902 {
1903     internal::assertSupportedConfiguration();
1904 #ifdef CAROTENE_NEON
1905     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1906     const s32 hsv_shift = 12;
1907 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1908     register const f32 vsdiv_table = f32(255 << hsv_shift);
1909     register f32 vhdiv_table = f32(hrange << hsv_shift);
1910     register const s32 vhrange = hrange;
1911     register const s32 v0 = s32(0);
1912     register const s32 vshift = s32(1 << (hsv_shift-1));
1913     register const s32 v6 = s32(6);
1914     register const f32 bias = 0.5f;
1915 #endif
1916 
1917     for (size_t i = 0u; i < size.height; ++i)
1918     {
1919         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1920         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1921         size_t sj = 0u, dj = 0u, j = 0u;
1922 
1923         for (; j < roiw8; sj += 32, dj += 24, j += 8)
1924         {
1925             internal::prefetch(src + sj);
1926 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1927             CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2)
1928 #else
1929             uint8x8x4_t vRgb = vld4_u8(src + sj);
1930             uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
1931             vst3_u8(dst + dj, vHsv);
1932 #endif
1933         }
1934 
1935         for (; j < size.width; ++j, sj += 4, dj += 3)
1936         {
1937             convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
1938         }
1939     }
1940 #else
1941     (void)size;
1942     (void)srcBase;
1943     (void)srcStride;
1944     (void)dstBase;
1945     (void)dstStride;
1946     (void)hrange;
1947 #endif
1948 }
1949 
1950 void bgr2hsv(const Size2D &size,
1951              const u8 * srcBase, ptrdiff_t srcStride,
1952              u8 * dstBase, ptrdiff_t dstStride,
1953              s32 hrange)
1954 {
1955     internal::assertSupportedConfiguration();
1956 #ifdef CAROTENE_NEON
1957     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1958     const s32 hsv_shift = 12;
1959 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1960     register const f32 vsdiv_table = f32(255 << hsv_shift);
1961     register f32 vhdiv_table = f32(hrange << hsv_shift);
1962     register const s32 vhrange = hrange;
1963     register const s32 v0 = s32(0);
1964     register const s32 vshift = s32(1 << (hsv_shift-1));
1965     register const s32 v6 = s32(6);
1966     register const f32 bias = 0.5f;
1967 #endif
1968 
1969     for (size_t i = 0u; i < size.height; ++i)
1970     {
1971         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1972         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1973         size_t sj = 0u, dj = 0u, j = 0u;
1974 
1975         for (; j < roiw8; sj += 24, dj += 24, j += 8)
1976         {
1977             internal::prefetch(src + sj);
1978 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
1979             CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0)
1980 #else
1981             uint8x8x3_t vRgb = vld3_u8(src + sj);
1982             uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);
1983             vst3_u8(dst + dj, vHsv);
1984 #endif
1985         }
1986 
1987         for (; j < size.width; ++j, sj += 3, dj += 3)
1988         {
1989             convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);
1990         }
1991     }
1992 #else
1993     (void)size;
1994     (void)srcBase;
1995     (void)srcStride;
1996     (void)dstBase;
1997     (void)dstStride;
1998     (void)hrange;
1999 #endif
2000 }
2001 
2002 void bgrx2hsv(const Size2D &size,
2003               const u8 * srcBase, ptrdiff_t srcStride,
2004               u8 * dstBase, ptrdiff_t dstStride,
2005               s32 hrange)
2006 {
2007     internal::assertSupportedConfiguration();
2008 #ifdef CAROTENE_NEON
2009     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2010     const s32 hsv_shift = 12;
2011 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2012     register const f32 vsdiv_table = f32(255 << hsv_shift);
2013     register f32 vhdiv_table = f32(hrange << hsv_shift);
2014     register const s32 vhrange = hrange;
2015     register const s32 v0 = s32(0);
2016     register const s32 vshift = s32(1 << (hsv_shift-1));
2017     register const s32 v6 = s32(6);
2018     register const f32 bias = 0.5f;
2019 #endif
2020 
2021     for (size_t i = 0u; i < size.height; ++i)
2022     {
2023         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2024         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2025         size_t sj = 0u, dj = 0u, j = 0u;
2026 
2027         for (; j < roiw8; sj += 32, dj += 24, j += 8)
2028         {
2029             internal::prefetch(src + sj);
2030 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2031             CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0)
2032 #else
2033             uint8x8x4_t vRgb = vld4_u8(src + sj);
2034             uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);
2035             vst3_u8(dst + dj, vHsv);
2036 #endif
2037         }
2038 
2039         for (; j < size.width; ++j, sj += 4, dj += 3)
2040         {
2041             convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);
2042         }
2043     }
2044 #else
2045     (void)size;
2046     (void)srcBase;
2047     (void)srcStride;
2048     (void)dstBase;
2049     (void)dstStride;
2050     (void)hrange;
2051 #endif
2052 }
2053 
2054 void rgbx2bgr565(const Size2D &size,
2055                  const u8 * srcBase, ptrdiff_t srcStride,
2056                  u8 * dstBase, ptrdiff_t dstStride)
2057 {
2058     internal::assertSupportedConfiguration();
2059 #ifdef CAROTENE_NEON
2060     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2061 
2062     for (size_t i = 0u; i < size.height; ++i)
2063     {
2064         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2065         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2066         size_t sj = 0u, dj = 0u, j = 0u;
2067 
2068         for (; j < roiw16; sj += 64, dj += 32, j += 16)
2069         {
2070             internal::prefetch(src + sj);
2071 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2072             __asm__ (
2073                 "vld4.8 {d2, d4, d6, d8}, [%[in0]]        @  q0       q1       q2       q3       q4       \n\t"
2074                 "vld4.8 {d3, d5, d7, d9}, [%[in1]]        @  xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2075                 "vsri.8 q1, q2, #5                        @  xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2076                 "vshl.u8 q0, q2, #3                       @  gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2077                 "vsri.8 q0, q3, #3                        @  gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2078                 "vst2.8 {d0, d2}, [%[out0]]                                                               \n\t"
2079                 "vst2.8 {d1, d3}, [%[out1]]                                                               \n\t"
2080                 : /*no output*/
2081                 : [out0] "r" (dst + dj),
2082                   [out1] "r" (dst + dj + 16),
2083                   [in0]  "r" (src + sj),
2084                   [in1]  "r" (src + sj + 32)
2085                 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
2086             );
2087 #else
2088             uint8x16x4_t vRgba = vld4q_u8(src + sj);
2089             uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);
2090             vst2q_u8(dst + dj, vVal565);
2091 #endif
2092         }
2093 
2094         for (; j < size.width; ++j, sj += 4, dj += 2)
2095         {
2096             convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);
2097         }
2098     }
2099 #else
2100     (void)size;
2101     (void)srcBase;
2102     (void)srcStride;
2103     (void)dstBase;
2104     (void)dstStride;
2105 #endif
2106 }
2107 
2108 void rgb2bgr565(const Size2D &size,
2109                  const u8 * srcBase, ptrdiff_t srcStride,
2110                  u8 * dstBase, ptrdiff_t dstStride)
2111 {
2112     internal::assertSupportedConfiguration();
2113 #ifdef CAROTENE_NEON
2114     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2115 
2116     for (size_t i = 0u; i < size.height; ++i)
2117     {
2118         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2119         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2120         size_t sj = 0u, dj = 0u, j = 0u;
2121 
2122         for (; j < roiw16; sj += 48, dj += 32, j += 16)
2123         {
2124             internal::prefetch(src + sj);
2125 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2126             __asm__ (
2127                 "vld3.8 {d2, d4, d6}, [%[in0]]       @  q0       q1       q2       q3       q4       \n\t"
2128                 "vld3.8 {d3, d5, d7}, [%[in1]]       @  xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2129                 "vsri.8 q1, q2, #5                   @  xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2130                 "vshl.u8 q0, q2, #3                  @  gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2131                 "vsri.8 q0, q3, #3                   @  gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2132                 "vst2.8 {d0, d2}, [%[out0]]                                                          \n\t"
2133                 "vst2.8 {d1, d3}, [%[out1]]                                                          \n\t"
2134                 : /*no output*/
2135                 : [out0] "r" (dst + dj),
2136                   [out1] "r" (dst + dj + 16),
2137                   [in0]  "r" (src + sj),
2138                   [in1]  "r" (src + sj + 24)
2139                 : "d0","d1","d2","d3","d4","d5","d6","d7"
2140             );
2141 #else
2142             uint8x16x3_t vRgba = vld3q_u8(src + sj);
2143             uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);
2144             vst2q_u8(dst + dj, vVal565);
2145 #endif
2146         }
2147 
2148         for (; j < size.width; ++j, sj += 3, dj += 2)
2149         {
2150             convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);
2151         }
2152     }
2153 #else
2154     (void)size;
2155     (void)srcBase;
2156     (void)srcStride;
2157     (void)dstBase;
2158     (void)dstStride;
2159 #endif
2160 }
2161 
2162 void rgbx2rgb565(const Size2D &size,
2163                  const u8 * srcBase, ptrdiff_t srcStride,
2164                  u8 * dstBase, ptrdiff_t dstStride)
2165 {
2166     internal::assertSupportedConfiguration();
2167 #ifdef CAROTENE_NEON
2168     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2169 
2170     for (size_t i = 0u; i < size.height; ++i)
2171     {
2172         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2173         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2174         size_t sj = 0u, dj = 0u, j = 0u;
2175 
2176         for (; j < roiw16; sj += 64, dj += 32, j += 16)
2177         {
2178             internal::prefetch(src + sj);
2179 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2180             __asm__ (
2181                 "vld4.8 {d0, d2, d4, d6}, [%[in0]]    @  q0       q1       q2       q3         \n\t"
2182                 "vld4.8 {d1, d3, d5, d7}, [%[in1]]    @  rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA   \n\t"
2183                 "vsri.8 q2, q1, #5                    @  rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA   \n\t"
2184                 "vshl.u8 q1, #3                       @  rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA   \n\t"
2185                 "vsri.8 q1, q0, #3                    @  rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA   \n\t"
2186                 "vst2.8 {d2, d4}, [%[out0]]                                                    \n\t"
2187                 "vst2.8 {d3, d5}, [%[out1]]                                                    \n\t"
2188                 : /*no output*/
2189                 : [out0] "r" (dst + dj),
2190                   [out1] "r" (dst + dj + 16),
2191                   [in0]  "r" (src + sj),
2192                   [in1]  "r" (src + sj + 32)
2193                 : "d0","d1","d2","d3","d4","d5","d6","d7"
2194             );
2195 #else
2196             uint8x16x4_t vRgba = vld4q_u8(src + sj);
2197             uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);
2198             vst2q_u8(dst + dj, vVal565);
2199 #endif
2200         }
2201 
2202         for (; j < size.width; ++j, sj += 4, dj += 2)
2203         {
2204             convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);
2205         }
2206     }
2207 #else
2208     (void)size;
2209     (void)srcBase;
2210     (void)srcStride;
2211     (void)dstBase;
2212     (void)dstStride;
2213 #endif
2214 }
2215 
2216 void rgb2rgb565(const Size2D &size,
2217                  const u8 * srcBase, ptrdiff_t srcStride,
2218                  u8 * dstBase, ptrdiff_t dstStride)
2219 {
2220     internal::assertSupportedConfiguration();
2221 #ifdef CAROTENE_NEON
2222     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2223 
2224     for (size_t i = 0u; i < size.height; ++i)
2225     {
2226         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2227         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2228         size_t sj = 0u, dj = 0u, j = 0u;
2229 
2230         for (; j < roiw16; sj += 48, dj += 32, j += 16)
2231         {
2232             internal::prefetch(src + sj);
2233 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2234             __asm__ (
2235                 "vld3.8 {d0, d2, d4}, [%[in0]]        @  q0       q1       q2       q3         \n\t"
2236                 "vld3.8 {d1, d3, d5}, [%[in1]]        @  rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx   \n\t"
2237                 "vsri.8 q2, q1, #5                    @  rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx   \n\t"
2238                 "vshl.u8 q1, #3                       @  rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx   \n\t"
2239                 "vsri.8 q1, q0, #3                    @  rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx   \n\t"
2240                 "vst2.8 {d2, d4}, [%[out0]]                                                    \n\t"
2241                 "vst2.8 {d3, d5}, [%[out1]]                                                    \n\t"
2242                 : /*no output*/
2243                 : [out0] "r" (dst + dj),
2244                   [out1] "r" (dst + dj + 16),
2245                   [in0]  "r" (src + sj),
2246                   [in1]  "r" (src + sj + 24)
2247                 : "d0","d1","d2","d3","d4","d5"
2248             );
2249 #else
2250             uint8x16x3_t vRgba = vld3q_u8(src + sj);
2251             uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);
2252             vst2q_u8(dst + dj, vVal565);
2253 #endif
2254         }
2255 
2256         for (; j < size.width; ++j, sj += 3, dj += 2)
2257         {
2258             convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);
2259         }
2260     }
2261 #else
2262     (void)size;
2263     (void)srcBase;
2264     (void)srcStride;
2265     (void)dstBase;
2266     (void)dstStride;
2267 #endif
2268 }
2269 
2270 void rgb2ycrcb(const Size2D &size,
2271                const u8 * srcBase, ptrdiff_t srcStride,
2272                u8 * dstBase, ptrdiff_t dstStride)
2273 {
2274     internal::assertSupportedConfiguration();
2275 #ifdef CAROTENE_NEON
2276     YCRCB_CONSTS
2277     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2278 
2279     for (size_t i = 0u; i < size.height; ++i)
2280     {
2281         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2282         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2283         size_t sj = 0u, dj = 0u, j = 0u;
2284 
2285         for (; j < roiw8; sj += 24, dj += 24, j += 8)
2286         {
2287             internal::prefetch(src + sj);
2288 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2289             CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2)
2290 #else
2291             uint8x8x3_t vRgb = vld3_u8(src + sj);
2292             int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0]));
2293             int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1]));
2294             int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2]));
2295             uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2296             vst3_u8(dst + dj, vYCrCb);
2297 #endif
2298         }
2299 
2300         for (; j < size.width; ++j, sj += 3, dj += 3)
2301         {
2302             S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);
2303         }
2304     }
2305 #else
2306     (void)size;
2307     (void)srcBase;
2308     (void)srcStride;
2309     (void)dstBase;
2310     (void)dstStride;
2311 #endif
2312 }
2313 
2314 void rgbx2ycrcb(const Size2D &size,
2315                 const u8 * srcBase, ptrdiff_t srcStride,
2316                 u8 * dstBase, ptrdiff_t dstStride)
2317 {
2318     internal::assertSupportedConfiguration();
2319 #ifdef CAROTENE_NEON
2320     YCRCB_CONSTS
2321     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2322 
2323     for (size_t i = 0u; i < size.height; ++i)
2324     {
2325         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2326         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2327         size_t sj = 0u, dj = 0u, j = 0u;
2328 
2329         for (; j < roiw8; sj += 32, dj += 24, j += 8)
2330         {
2331             internal::prefetch(src + sj);
2332 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2333             CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2)
2334 #else
2335             uint8x8x4_t vRgba = vld4_u8(src + sj);
2336             int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0]));
2337             int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1]));
2338             int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2]));
2339             uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2340             vst3_u8(dst + dj, vYCrCb);
2341 #endif
2342         }
2343 
2344         for (; j < size.width; ++j, sj += 4, dj += 3)
2345         {
2346             S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);
2347         }
2348     }
2349 #else
2350     (void)size;
2351     (void)srcBase;
2352     (void)srcStride;
2353     (void)dstBase;
2354     (void)dstStride;
2355 #endif
2356 }
2357 
2358 void bgr2ycrcb(const Size2D &size,
2359                const u8 * srcBase, ptrdiff_t srcStride,
2360                u8 * dstBase, ptrdiff_t dstStride)
2361 {
2362     internal::assertSupportedConfiguration();
2363 #ifdef CAROTENE_NEON
2364     YCRCB_CONSTS
2365     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2366 
2367     for (size_t i = 0u; i < size.height; ++i)
2368     {
2369         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2370         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2371         size_t sj = 0u, dj = 0u, j = 0u;
2372 
2373         for (; j < roiw8; sj += 24, dj += 24, j += 8)
2374         {
2375             internal::prefetch(src + sj);
2376 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2377             CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0)
2378 #else
2379             uint8x8x3_t vBgr = vld3_u8(src + sj);
2380             int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0]));
2381             int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1]));
2382             int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2]));
2383             uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2384             vst3_u8(dst + dj, vYCrCb);
2385 #endif
2386         }
2387 
2388         for (; j < size.width; ++j, sj += 3, dj += 3)
2389         {
2390             S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);
2391         }
2392     }
2393 #else
2394     (void)size;
2395     (void)srcBase;
2396     (void)srcStride;
2397     (void)dstBase;
2398     (void)dstStride;
2399 #endif
2400 }
2401 
2402 void bgrx2ycrcb(const Size2D &size,
2403                 const u8 * srcBase, ptrdiff_t srcStride,
2404                 u8 * dstBase, ptrdiff_t dstStride)
2405 {
2406     internal::assertSupportedConfiguration();
2407 #ifdef CAROTENE_NEON
2408     YCRCB_CONSTS
2409     size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2410 
2411     for (size_t i = 0u; i < size.height; ++i)
2412     {
2413         const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2414         u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2415         size_t sj = 0u, dj = 0u, j = 0u;
2416 
2417         for (; j < roiw8; sj += 32, dj += 24, j += 8)
2418         {
2419             internal::prefetch(src + sj);
2420 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2421             CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0)
2422 #else
2423             uint8x8x4_t vBgra = vld4_u8(src + sj);
2424             int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0]));
2425             int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1]));
2426             int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2]));
2427             uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2428             vst3_u8(dst + dj, vYCrCb);
2429 #endif
2430         }
2431 
2432         for (; j < size.width; ++j, sj += 4, dj += 3)
2433         {
2434             S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);
2435         }
2436     }
2437 #else
2438     (void)size;
2439     (void)srcBase;
2440     (void)srcStride;
2441     (void)dstBase;
2442     (void)dstStride;
2443 #endif
2444 }
2445 
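// YUV 4:2:0 semi-planar to RGB/BGR converters. Each outer-loop iteration consumes
// two Y rows and one interleaved chroma row (chroma is subsampled 2x2) and produces
// two output rows. The NEON path converts 16 pixels per step (48 bytes of 3-channel
// or 64 bytes of 4-channel output); the tail loop handles 2 pixels at a time.
// Judging by the call sites, the YUV420_CONSTS / convertYUV420ToRGB parameters select
// the output channel count (3 or 4), the blue-channel index (2 = RGB order,
// 0 = BGR order), and the position of V within each chroma pair (0 = VU interleaving
// for the "sp" functions, 1 = UV for the "i" functions).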
2446 void yuv420sp2rgb(const Size2D &size,
2447                   const u8 *  yBase, ptrdiff_t  yStride,
2448                   const u8 * uvBase, ptrdiff_t uvStride,
2449                   u8 * dstBase, ptrdiff_t dstStride)
2450 {
2451     // input data:
2452     ////////////// Y matrix:
2453     // {y1, y2,   y3, y4,   y5, y6,   y7, y8,   y9, y10, y11, y12, y13, y14, y15, y16}
2454     // {Y1, Y2,   Y3, Y4,   Y5, Y6,   Y7, Y8,   Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16}
2455     ////////////// UV matrix:
2456     // {v12, u12, v34, u34, v56, u56, v78, u78, v90, u90, V12, U12, V34, U34, V56, U56}
2457 
2458     // fp version
2459     // R = 1.164(Y - 16) + 1.596(V - 128)
2460     // G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
2461     // B = 1.164(Y - 16)                  + 2.018(U - 128)
2462 
2463     // integer version
2464     // R = [((149*y)/2 + (-14248+102*v)      )/2]/32
2465     // G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/32
2466     // B = [((149*y)/2 + (-17705+129*u)      )/2]/32
2467 
2468     // error estimation:
2469     //Rerr = 0.0000625 * y - 0.00225 * v                - 0.287
2470     //Gerr = 0.0000625 * y + 0.0005  * v + 0.000375 * u + 0.128625
2471     //Berr = 0.0000625 * y               - 0.002375 * u - 0.287375
2472 
2473     //real error test:
2474     //=================
2475     //R: 1 less: 520960       ==  3.11% of full space
2476     //G: 1 less: 251425       ==  1.50% of full space
2477     //B: 1 less: 455424       ==  2.71% of full space
2478     //=================
2479     //R: 1 more: 642048       ==  3.83% of full space
2480     //G: 1 more: 192458       ==  1.15% of full space
2481     //B: 1 more: 445184       ==  2.65% of full space
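    // Example (sanity check of the integer form): for y = u = v = 128,
    //   R = [((149*128)/2 + (-14248 + 102*128))/2]/32 = [(9536 - 1192)/2]/32 = 4172/32 -> 130,
    // which matches the floating-point 1.164*(128-16) = 130.37 -> 130; G and B also evaluate to 130.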
2482 
2483     internal::assertSupportedConfiguration();
2484 #ifdef CAROTENE_NEON
2485     YUV420_CONSTS(3, 2, 0)
2486     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2487 
2488     for (size_t i = 0u; i < size.height; i+=2)
2489     {
2490         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2491         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2492         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2493         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2494         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2495 
2496         size_t dj = 0u, j = 0u;
2497         for (; j < roiw16; dj += 48, j += 16)
2498         {
2499             internal::prefetch(uv + j);
2500             internal::prefetch(y1 + j);
2501             internal::prefetch(y2 + j);
2502 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2503             CONVERTYUV420TORGB(3, d1, d0, q5, q6)
2504 #else
2505             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2506 #endif
2507         }
2508         for (; j + 2 <= size.width; j+=2, dj += 6)
2509         {
2510             convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2511         }
2512     }
2513 #else
2514     (void)size;
2515     (void)yBase;
2516     (void)yStride;
2517     (void)uvBase;
2518     (void)uvStride;
2519     (void)dstBase;
2520     (void)dstStride;
2521 #endif
2522 }
2523 
2524 void yuv420sp2rgbx(const Size2D &size,
2525                    const u8 *  yBase, ptrdiff_t  yStride,
2526                    const u8 * uvBase, ptrdiff_t uvStride,
2527                    u8 * dstBase, ptrdiff_t dstStride)
2528 {
2529     internal::assertSupportedConfiguration();
2530 #ifdef CAROTENE_NEON
2531     YUV420_CONSTS(4, 2, 0)
2532     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2533 
2534     for (size_t i = 0u; i < size.height; i+=2)
2535     {
2536         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2537         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2538         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2539         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2540         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2541 
2542         size_t dj = 0u, j = 0u;
2543         for (; j < roiw16; dj += 64, j += 16)
2544         {
2545             internal::prefetch(uv + j);
2546             internal::prefetch(y1 + j);
2547             internal::prefetch(y2 + j);
2548 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2549             CONVERTYUV420TORGB(4, d1, d0, q5, q6)
2550 #else
2551             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2552 #endif
2553         }
2554         for (; j + 2 <= size.width; j+=2, dj += 8)
2555         {
2556             convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2557         }
2558     }
2559 #else
2560     (void)size;
2561     (void)yBase;
2562     (void)yStride;
2563     (void)uvBase;
2564     (void)uvStride;
2565     (void)dstBase;
2566     (void)dstStride;
2567 #endif
2568 }
2569 
2570 void yuv420i2rgb(const Size2D &size,
2571                  const u8 *  yBase, ptrdiff_t  yStride,
2572                  const u8 * uvBase, ptrdiff_t uvStride,
2573                  u8 * dstBase, ptrdiff_t dstStride)
2574 {
2575     internal::assertSupportedConfiguration();
2576 #ifdef CAROTENE_NEON
2577     YUV420_CONSTS(3, 2, 1)
2578     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2579 
2580     for (size_t i = 0u; i < size.height; i+=2)
2581     {
2582         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2583         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2584         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2585         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2586         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2587 
2588         size_t dj = 0u, j = 0u;
2589         for (; j < roiw16; dj += 48, j += 16)
2590         {
2591             internal::prefetch(uv + j);
2592             internal::prefetch(y1 + j);
2593             internal::prefetch(y2 + j);
2594 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2595             CONVERTYUV420TORGB(3, d0, d1, q5, q6)
2596 #else
2597             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2598 #endif
2599         }
2600         for (; j + 2 <= size.width; j+=2, dj += 6)
2601         {
2602             convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2603         }
2604     }
2605 #else
2606     (void)size;
2607     (void)yBase;
2608     (void)yStride;
2609     (void)uvBase;
2610     (void)uvStride;
2611     (void)dstBase;
2612     (void)dstStride;
2613 #endif
2614 }
2615 
2616 void yuv420i2rgbx(const Size2D &size,
2617                   const u8 *  yBase, ptrdiff_t  yStride,
2618                   const u8 * uvBase, ptrdiff_t uvStride,
2619                   u8 * dstBase, ptrdiff_t dstStride)
2620 {
2621     internal::assertSupportedConfiguration();
2622 #ifdef CAROTENE_NEON
2623     YUV420_CONSTS(4, 2, 1)
2624     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2625 
2626     for (size_t i = 0u; i < size.height; i+=2)
2627     {
2628         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2629         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2630         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2631         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2632         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2633 
2634         size_t dj = 0u, j = 0u;
2635         for (; j < roiw16; dj += 64, j += 16)
2636         {
2637             internal::prefetch(uv + j);
2638             internal::prefetch(y1 + j);
2639             internal::prefetch(y2 + j);
2640 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2641             CONVERTYUV420TORGB(4, d0, d1, q5, q6)
2642 #else
2643             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2644 #endif
2645         }
2646         for (; j + 2 <= size.width; j+=2, dj += 8)
2647         {
2648             convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2649         }
2650     }
2651 #else
2652     (void)size;
2653     (void)yBase;
2654     (void)yStride;
2655     (void)uvBase;
2656     (void)uvStride;
2657     (void)dstBase;
2658     (void)dstStride;
2659 #endif
2660 }
2661 
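// BGR-output counterparts of the converters above: identical structure, only the
// blue-channel index passed to YUV420_CONSTS / convertYUV420ToRGB changes from 2 to 0
// (and the inline-asm macro swaps q5/q6 accordingly).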
2662 void yuv420sp2bgr(const Size2D &size,
2663                   const u8 *  yBase, ptrdiff_t  yStride,
2664                   const u8 * uvBase, ptrdiff_t uvStride,
2665                   u8 * dstBase, ptrdiff_t dstStride)
2666 {
2667     internal::assertSupportedConfiguration();
2668 #ifdef CAROTENE_NEON
2669     YUV420_CONSTS(3, 0, 0)
2670     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2671 
2672     for (size_t i = 0u; i < size.height; i+=2)
2673     {
2674         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2675         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2676         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2677         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2678         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2679 
2680         size_t dj = 0u, j = 0u;
2681         for (; j < roiw16; dj += 48, j += 16)
2682         {
2683             internal::prefetch(uv + j);
2684             internal::prefetch(y1 + j);
2685             internal::prefetch(y2 + j);
2686 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2687             CONVERTYUV420TORGB(3, d1, d0, q6, q5)
2688 #else
2689             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2690 #endif
2691         }
2692         for (; j + 2 <= size.width; j+=2, dj += 6)
2693         {
2694             convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2695         }
2696     }
2697 #else
2698     (void)size;
2699     (void)yBase;
2700     (void)yStride;
2701     (void)uvBase;
2702     (void)uvStride;
2703     (void)dstBase;
2704     (void)dstStride;
2705 #endif
2706 }
2707 
2708 void yuv420sp2bgrx(const Size2D &size,
2709                    const u8 *  yBase, ptrdiff_t  yStride,
2710                    const u8 * uvBase, ptrdiff_t uvStride,
2711                    u8 * dstBase, ptrdiff_t dstStride)
2712 {
2713     internal::assertSupportedConfiguration();
2714 #ifdef CAROTENE_NEON
2715     YUV420_CONSTS(4, 0, 0)
2716     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2717 
2718     for (size_t i = 0u; i < size.height; i+=2)
2719     {
2720         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2721         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2722         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2723         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2724         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2725 
2726         size_t dj = 0u, j = 0u;
2727         for (; j < roiw16; dj += 64, j += 16)
2728         {
2729             internal::prefetch(uv + j);
2730             internal::prefetch(y1 + j);
2731             internal::prefetch(y2 + j);
2732 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2733             CONVERTYUV420TORGB(4, d1, d0, q6, q5)
2734 #else
2735             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2736 #endif
2737         }
2738         for (; j + 2 <= size.width; j+=2, dj += 8)
2739         {
2740             convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2741         }
2742     }
2743 #else
2744     (void)size;
2745     (void)yBase;
2746     (void)yStride;
2747     (void)uvBase;
2748     (void)uvStride;
2749     (void)dstBase;
2750     (void)dstStride;
2751 #endif
2752 }
2753 
2754 void yuv420i2bgr(const Size2D &size,
2755                  const u8 *  yBase, ptrdiff_t  yStride,
2756                  const u8 * uvBase, ptrdiff_t uvStride,
2757                  u8 * dstBase, ptrdiff_t dstStride)
2758 {
2759     internal::assertSupportedConfiguration();
2760 #ifdef CAROTENE_NEON
2761     YUV420_CONSTS(3, 0, 1)
2762     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2763 
2764     for (size_t i = 0u; i < size.height; i+=2)
2765     {
2766         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2767         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2768         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2769         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2770         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2771 
2772         size_t dj = 0u, j = 0u;
2773         for (; j < roiw16; dj += 48, j += 16)
2774         {
2775             internal::prefetch(uv + j);
2776             internal::prefetch(y1 + j);
2777             internal::prefetch(y2 + j);
2778 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2779             CONVERTYUV420TORGB(3, d0, d1, q6, q5)
2780 #else
2781             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2782 #endif
2783         }
2784         for (; j + 2 <= size.width; j+=2, dj += 6)
2785         {
2786             convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2787         }
2788     }
2789 #else
2790     (void)size;
2791     (void)yBase;
2792     (void)yStride;
2793     (void)uvBase;
2794     (void)uvStride;
2795     (void)dstBase;
2796     (void)dstStride;
2797 #endif
2798 }
2799 
2800 void yuv420i2bgrx(const Size2D &size,
2801                   const u8 *  yBase, ptrdiff_t  yStride,
2802                   const u8 * uvBase, ptrdiff_t uvStride,
2803                   u8 * dstBase, ptrdiff_t dstStride)
2804 {
2805     internal::assertSupportedConfiguration();
2806 #ifdef CAROTENE_NEON
2807     YUV420_CONSTS(4, 0, 1)
2808     size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2809 
2810     for (size_t i = 0u; i < size.height; i+=2)
2811     {
2812         const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2813         const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2814         const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2815         u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2816         u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2817 
2818         size_t dj = 0u, j = 0u;
2819         for (; j < roiw16; dj += 64, j += 16)
2820         {
2821             internal::prefetch(uv + j);
2822             internal::prefetch(y1 + j);
2823             internal::prefetch(y2 + j);
2824 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 &&  __GNUC_MINOR__ < 7 && !defined(__clang__)
2825             CONVERTYUV420TORGB(4, d0, d1, q6, q5)
2826 #else
2827             convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2828 #endif
2829         }
2830         for (; j + 2 <= size.width; j+=2, dj += 8)
2831         {
2832             convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2833         }
2834     }
2835 #else
2836     (void)size;
2837     (void)yBase;
2838     (void)yStride;
2839     (void)uvBase;
2840     (void)uvStride;
2841     (void)dstBase;
2842     (void)dstStride;
2843 #endif
2844 }
2845 
2846 } // namespace CAROTENE_NS
2847