1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41
42 #include "saturate_cast.hpp"
43
44 namespace CAROTENE_NS {
45
46 #ifdef CAROTENE_NEON
47
48 namespace {
49
50 enum
51 {
52 SHIFT = 14,
53 SHIFT_DELTA = 1 << (SHIFT - 1),
54
55 R2Y_BT601 = 4899,
56 G2Y_BT601 = 9617,
57 B2Y_BT601 = 1868,
58
59 R2Y_BT709 = 3483,
60 G2Y_BT709 = 11718,
61 B2Y_BT709 = 1183,
62 };
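/* The table above holds the standard luma weights scaled by 2^SHIFT = 16384, so the
 * conversions below compute, per pixel,
 *     gray = (R*R2Y + G*G2Y + B*B2Y + SHIFT_DELTA) >> SHIFT.
 * BT.601:  4899/16384 ~ 0.299,  9617/16384 ~ 0.587,  1868/16384 ~ 0.114
 * BT.709:  3483/16384 ~ 0.213, 11718/16384 ~ 0.715,  1183/16384 ~ 0.072
 * Each triple sums to exactly 16384, so pure white (255,255,255) maps back to 255. */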
63
64 inline uint8x8_t convertToGray(const uint16x8_t & v_r,
65 const uint16x8_t & v_g,
66 const uint16x8_t & v_b,
67 const uint16x4_t & v_r2y,
68 const uint16x4_t & v_g2y,
69 const uint16x4_t & v_b2y)
70 {
71 uint32x4_t v_dst0 = vmull_u16(vget_low_u16(v_g), v_g2y);
72 uint32x4_t v_dst1 = vmull_u16(vget_high_u16(v_g), v_g2y);
73
74 v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_r), v_r2y);
75 v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_r), v_r2y);
76
77 v_dst0 = vmlal_u16(v_dst0, vget_low_u16(v_b), v_b2y);
78 v_dst1 = vmlal_u16(v_dst1, vget_high_u16(v_b), v_b2y);
79
80 uint8x8_t v_gray = vqmovn_u16(vcombine_u16(vrshrn_n_u32(v_dst0, SHIFT),
81 vrshrn_n_u32(v_dst1, SHIFT)));
82
83 return v_gray;
84 }
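// Per lane this is the scalar formula gray = saturate_u8((r*R2Y + g*G2Y + b*B2Y + SHIFT_DELTA) >> SHIFT):
// vrshrn_n_u32 performs the rounding shift by SHIFT (14) and vqmovn_u16 the saturating narrow to u8.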
85
86 } // namespace
87
88 #endif
89
90 void rgb2gray(const Size2D &size, COLOR_SPACE color_space,
91 const u8 * srcBase, ptrdiff_t srcStride,
92 u8 * dstBase, ptrdiff_t dstStride)
93 {
94 internal::assertSupportedConfiguration();
95 #ifdef CAROTENE_NEON
96 const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
97 const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
98 const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
99
100 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
101 register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
102 register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
103 register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
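// On 32-bit ARM with GCC older than 4.7 the coefficients are pinned to d29-d31 so the
// hand-written NEON block in the loop below can reference them by register name;
// other compilers take the intrinsics path in the #else branch.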
104 #else
105 uint16x4_t v_r2y = vdup_n_u16(R2Y),
106 v_g2y = vdup_n_u16(G2Y),
107 v_b2y = vdup_n_u16(B2Y);
108
109 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
110 #endif
111 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
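// roiw16/roiw8 mark how far the 16- and 8-pixel vector iterations may run without
// reading past the end of a row; the plain scalar loop at the bottom of each row
// handles the remaining tail pixels.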
112
113 for (size_t i = 0u; i < size.height; ++i)
114 {
115 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
116 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
117 size_t sj = 0u, dj = 0u;
118
119 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
120 for (; dj < roiw8; sj += 24, dj += 8)
121 {
122 internal::prefetch(src + sj);
123 __asm__ (
124 "vld3.8 {d0-d2}, [%[in]] @RGB \n\t"
125 "vmovl.u8 q2, d0 @R (d4,d5) \n\t"
126 "vmovl.u8 q3, d1 @G (d6,d7) \n\t"
127 "vmovl.u8 q4, d2 @B (d8,d9) \n\t"
128 "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
129 "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
130 "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
131 "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
132 "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
133 "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
134 "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
135 "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
136 "vqmovn.u16 d4, q4 \n\t"
137 "vst1.8 {d4}, [%[out]] \n\t"
138 : /*no output*/
139 : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
140 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
141 );
142 }
143 #else
144 for (; dj < roiw16; sj += 48, dj += 16)
145 {
146 internal::prefetch(src + sj);
147 uint8x16x3_t v_src0 = vld3q_u8(src + sj);
148 // 0
149 uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),
150 v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
151 v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));
152 uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
153
154 v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),
155 v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
156 v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));
157 uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
158
159 vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
160 }
161
162 if (dj < roiw8)
163 {
164 uint8x8x3_t v_src = vld3_u8(src + sj);
165 uint16x8_t v_r = vmovl_u8(v_src.val[0]),
166 v_g = vmovl_u8(v_src.val[1]),
167 v_b = vmovl_u8(v_src.val[2]);
168 uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
169
170 vst1_u8(dst + dj, v_gray);
171 sj += 24; dj += 8;
172 }
173 #endif
174
175 for (; dj < size.width; sj += 3, dj++)
176 {
177 u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;
178 dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
179 }
180 }
181 #else
182 (void)size;
183 (void)color_space;
184 (void)srcBase;
185 (void)srcStride;
186 (void)dstBase;
187 (void)dstStride;
188 #endif
189 }
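/* A minimal usage sketch (buffer setup is hypothetical; a Size2D(width, height)
 * constructor is assumed): convert a tightly packed W x H RGB image to gray
 * using the BT.601 weights. Strides are given in bytes.
 *
 *     const size_t W = 640, H = 480;
 *     std::vector<u8> rgb(W * H * 3), gray(W * H);
 *     CAROTENE_NS::rgb2gray(Size2D(W, H), COLOR_SPACE_BT601,
 *                           rgb.data(), ptrdiff_t(W * 3),
 *                           gray.data(), ptrdiff_t(W));
 */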
190
191 void rgbx2gray(const Size2D &size, COLOR_SPACE color_space,
192 const u8 * srcBase, ptrdiff_t srcStride,
193 u8 * dstBase, ptrdiff_t dstStride)
194 {
195 internal::assertSupportedConfiguration();
196 #ifdef CAROTENE_NEON
197 const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
198 const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
199 const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
200
201 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
202 register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
203 register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
204 register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
205 #else
206 uint16x4_t v_r2y = vdup_n_u16(R2Y),
207 v_g2y = vdup_n_u16(G2Y),
208 v_b2y = vdup_n_u16(B2Y);
209
210 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
211 #endif
212 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
213
214 for (size_t i = 0u; i < size.height; ++i)
215 {
216 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
217 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
218 size_t sj = 0u, dj = 0u;
219
220 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
221 for (; dj < roiw8; sj += 32, dj += 8)
222 {
223 internal::prefetch(src + sj);
224 __asm__ (
225 "vld4.8 {d0-d3}, [%[in]] @RGBA \n\t"
226 "vmovl.u8 q2, d0 @R (d4,d5) \n\t"
227 "vmovl.u8 q3, d1 @G (d6,d7) \n\t"
228 "vmovl.u8 q4, d2 @B (d8,d9) \n\t"
229 "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
230 "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
231 "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
232 "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
233 "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
234 "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
235 "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
236 "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
237 "vqmovn.u16 d4, q4 \n\t"
238 "vst1.8 {d4}, [%[out]] \n\t"
239 : /*no output*/
240 : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
241 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
242 );
243 }
244 #else
245 for (; dj < roiw16; sj += 64, dj += 16)
246 {
247 internal::prefetch(src + sj);
248 uint8x16x4_t v_src0 = vld4q_u8(src + sj);
249
250 // 0
251 uint16x8_t v_r = vmovl_u8(vget_low_u8(v_src0.val[0])),
252 v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
253 v_b = vmovl_u8(vget_low_u8(v_src0.val[2]));
254 uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
255
256 v_r = vmovl_u8(vget_high_u8(v_src0.val[0])),
257 v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
258 v_b = vmovl_u8(vget_high_u8(v_src0.val[2]));
259 uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
260
261 vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
262 }
263
264 if (dj < roiw8)
265 {
266 uint8x8x4_t v_src = vld4_u8(src + sj);
267 uint16x8_t v_r = vmovl_u8(v_src.val[0]),
268 v_g = vmovl_u8(v_src.val[1]),
269 v_b = vmovl_u8(v_src.val[2]);
270 uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
271
272 vst1_u8(dst + dj, v_gray);
273 sj += 32; dj += 8;
274 }
275 #endif
276
277 for (; dj < size.width; sj += 4, dj++)
278 {
279 u32 val = src[sj] * R2Y + src[sj + 1] * G2Y + src[sj + 2] * B2Y;
280 dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
281 }
282 }
283 #else
284 (void)size;
285 (void)color_space;
286 (void)srcBase;
287 (void)srcStride;
288 (void)dstBase;
289 (void)dstStride;
290 #endif
291 }
292
293 void bgr2gray(const Size2D &size, COLOR_SPACE color_space,
294 const u8 * srcBase, ptrdiff_t srcStride,
295 u8 * dstBase, ptrdiff_t dstStride)
296 {
297 internal::assertSupportedConfiguration();
298 #ifdef CAROTENE_NEON
299 const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
300 const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
301 const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
302
303 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
304 register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
305 register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
306 register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
307 #else
308 uint16x4_t v_r2y = vdup_n_u16(R2Y),
309 v_g2y = vdup_n_u16(G2Y),
310 v_b2y = vdup_n_u16(B2Y);
311
312 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
313 #endif
314 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
315
316 for (size_t i = 0u; i < size.height; ++i)
317 {
318 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
319 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
320 size_t sj = 0u, dj = 0u;
321
322 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
323 for (; dj < roiw8; sj += 24, dj += 8)
324 {
325 internal::prefetch(src + sj);
326 __asm__ (
327 "vld3.8 {d0-d2}, [%[in]] @BGR \n\t"
328 "vmovl.u8 q2, d2 @R (d4,d5) \n\t"
329 "vmovl.u8 q3, d1 @G (d6,d7) \n\t"
330 "vmovl.u8 q4, d0 @B (d8,d9) \n\t"
331 "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
332 "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
333 "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
334 "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
335 "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
336 "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
337 "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
338 "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
339 "vqmovn.u16 d4, q4 \n\t"
340 "vst1.8 {d4}, [%[out]] \n\t"
341 : /*no output*/
342 : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
343 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
344 );
345 }
346 #else
347 for (; dj < roiw16; sj += 48, dj += 16)
348 {
349 internal::prefetch(src + sj);
350 uint8x16x3_t v_src0 = vld3q_u8(src + sj);
351
352 // 0
353 uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),
354 v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
355 v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));
356 uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
357
358 v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),
359 v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
360 v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));
361 uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
362
363 vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
364 }
365
366 if (dj < roiw8)
367 {
368 uint8x8x3_t v_src = vld3_u8(src + sj);
369 uint16x8_t v_b = vmovl_u8(v_src.val[0]),
370 v_g = vmovl_u8(v_src.val[1]),
371 v_r = vmovl_u8(v_src.val[2]);
372 uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
373
374 vst1_u8(dst + dj, v_gray);
375 sj += 24; dj += 8;
376 }
377 #endif
378
379 for (; dj < size.width; sj += 3, dj++)
380 {
381 u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;
382 dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
383 }
384 }
385 #else
386 (void)size;
387 (void)color_space;
388 (void)srcBase;
389 (void)srcStride;
390 (void)dstBase;
391 (void)dstStride;
392 #endif
393 }
394
395 void bgrx2gray(const Size2D &size, COLOR_SPACE color_space,
396 const u8 * srcBase, ptrdiff_t srcStride,
397 u8 * dstBase, ptrdiff_t dstStride)
398 {
399 internal::assertSupportedConfiguration();
400 #ifdef CAROTENE_NEON
401 const u32 R2Y = color_space == COLOR_SPACE_BT601 ? R2Y_BT601 : R2Y_BT709;
402 const u32 G2Y = color_space == COLOR_SPACE_BT601 ? G2Y_BT601 : G2Y_BT709;
403 const u32 B2Y = color_space == COLOR_SPACE_BT601 ? B2Y_BT601 : B2Y_BT709;
404
405 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
406 register int16x4_t v_r2y asm ("d31") = vmov_n_s16(R2Y);
407 register int16x4_t v_g2y asm ("d30") = vmov_n_s16(G2Y);
408 register int16x4_t v_b2y asm ("d29") = vmov_n_s16(B2Y);
409 #else
410 uint16x4_t v_r2y = vdup_n_u16(R2Y),
411 v_g2y = vdup_n_u16(G2Y),
412 v_b2y = vdup_n_u16(B2Y);
413
414 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
415 #endif
416 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
417
418 for (size_t i = 0u; i < size.height; ++i)
419 {
420 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
421 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
422 size_t sj = 0u, dj = 0u;
423
424 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
425 for (; dj < roiw8; sj += 32, dj += 8)
426 {
427 internal::prefetch(src + sj);
428 __asm__ (
429 "vld4.8 {d0-d3}, [%[in]] @BGRA \n\t"
430 "vmovl.u8 q2, d2 @R (d4,d5) \n\t"
431 "vmovl.u8 q3, d1 @G (d6,d7) \n\t"
432 "vmovl.u8 q4, d0 @B (d8,d9) \n\t"
433 "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t"
434 "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t"
435 "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t"
436 "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t"
437 "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t"
438 "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t"
439 "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t"
440 "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t"
441 "vqmovn.u16 d4, q4 \n\t"
442 "vst1.8 {d4}, [%[out]] \n\t"
443 : /*no output*/
444 : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (v_r2y), "w" (v_g2y), "w" (v_b2y)
445 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9","d10","d11","d12","d13"
446 );
447 }
448 #else
449 for (; dj < roiw16; sj += 64, dj += 16)
450 {
451 internal::prefetch(src + sj);
452 uint8x16x4_t v_src0 = vld4q_u8(src + sj);
453
454 // 0
455 uint16x8_t v_b = vmovl_u8(vget_low_u8(v_src0.val[0])),
456 v_g = vmovl_u8(vget_low_u8(v_src0.val[1])),
457 v_r = vmovl_u8(vget_low_u8(v_src0.val[2]));
458 uint8x8_t v_gray0 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
459
460 v_b = vmovl_u8(vget_high_u8(v_src0.val[0])),
461 v_g = vmovl_u8(vget_high_u8(v_src0.val[1])),
462 v_r = vmovl_u8(vget_high_u8(v_src0.val[2]));
463 uint8x8_t v_gray1 = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
464
465 vst1q_u8(dst + dj, vcombine_u8(v_gray0, v_gray1));
466 }
467
468 if (dj < roiw8)
469 {
470 uint8x8x4_t v_src = vld4_u8(src + sj);
471 uint16x8_t v_b = vmovl_u8(v_src.val[0]),
472 v_g = vmovl_u8(v_src.val[1]),
473 v_r = vmovl_u8(v_src.val[2]);
474 uint8x8_t v_gray = convertToGray(v_r, v_g, v_b, v_r2y, v_g2y, v_b2y);
475
476 vst1_u8(dst + dj, v_gray);
477 sj += 32; dj += 8;
478 }
479 #endif
480
481 for (; dj < size.width; sj += 4, dj++)
482 {
483 u32 val = src[sj] * B2Y + src[sj + 1] * G2Y + src[sj + 2] * R2Y;
484 dst[dj] = internal::saturate_cast<u8>((val + SHIFT_DELTA) >> SHIFT);
485 }
486 }
487 #else
488 (void)size;
489 (void)color_space;
490 (void)srcBase;
491 (void)srcStride;
492 (void)dstBase;
493 (void)dstStride;
494 #endif
495 }
496
497 void gray2rgb(const Size2D &size,
498 const u8 * srcBase, ptrdiff_t srcStride,
499 u8 * dstBase, ptrdiff_t dstStride)
500 {
501 internal::assertSupportedConfiguration();
502 #ifdef CAROTENE_NEON
503 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
504 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
505
506 for (size_t i = 0u; i < size.height; ++i)
507 {
508 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
509 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
510 size_t sj = 0u, dj = 0u;
511
512 for (; sj < roiw16; sj += 16, dj += 48)
513 {
514 internal::prefetch(src + sj);
515 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
516 __asm__ (
517 "vld1.8 {d0-d1}, [%[in0]] \n\t"
518 "vmov.8 q1, q0 \n\t"
519 "vmov.8 q2, q0 \n\t"
520 "vmov.8 q3, q1 \n\t"
521 "vst3.8 {d2, d4, d6}, [%[out0]] \n\t"
522 "vst3.8 {d3, d5, d7}, [%[out1]] \n\t"
523 : /*no output*/
524 : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 24),
525 [in0] "r" (src + sj)
526 : "d0","d1","d2","d3","d4","d5","d6","d7"
527 );
528 #else
529 uint8x16x3_t vRgb1;
530 vRgb1.val[0] = vld1q_u8(src + sj);
531
532 vRgb1.val[1] = vRgb1.val[0];
533 vRgb1.val[2] = vRgb1.val[0];
534
535 vst3q_u8(dst + dj, vRgb1);
536 #endif
537 }
538
539 if (sj < roiw8)
540 {
541 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
542 __asm__ (
543 "vld1.8 {d0}, [%[in]] \n\t"
544 "vmov.8 d1, d0 \n\t"
545 "vmov.8 d2, d0 \n\t"
546 "vst3.8 {d0-d2}, [%[out]] \n\t"
547 : /*no output*/
548 : [out] "r" (dst + dj), [in] "r" (src + sj)
549 : "d0","d1","d2"
550 );
551 #else
552 uint8x8x3_t vRgb2;
553 vRgb2.val[0] = vld1_u8(src + sj);
554 vRgb2.val[1] = vRgb2.val[0];
555 vRgb2.val[2] = vRgb2.val[0];
556
557 vst3_u8(dst + dj, vRgb2);
558 #endif
559 sj += 8; dj += 24;
560 }
561
562 for (; sj < size.width; sj++, dj += 3)
563 {
564 dst[dj+0] = src[sj];
565 dst[dj+1] = src[sj];
566 dst[dj+2] = src[sj];
567 }
568 }
569 #else
570 (void)size;
571 (void)srcBase;
572 (void)srcStride;
573 (void)dstBase;
574 (void)dstStride;
575 #endif
576 }
577
578 void gray2rgbx(const Size2D &size,
579 const u8 * srcBase, ptrdiff_t srcStride,
580 u8 * dstBase, ptrdiff_t dstStride)
581 {
582 internal::assertSupportedConfiguration();
583 #ifdef CAROTENE_NEON
584 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
585 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
586
587 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
588 register uint8x16_t vc255 asm ("q4") = vmovq_n_u8(255);
589 #else
590 uint8x16x4_t vRgba;
591 uint8x8x4_t vRgba2;
592 vRgba.val[3] = vmovq_n_u8(255);
593 vRgba2.val[3] = vget_low_u8(vRgba.val[3]);
594 #endif
595
596 for (size_t i = 0u; i < size.height; ++i)
597 {
598 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
599 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
600 size_t sj = 0u, dj = 0u;
601
602 for (; sj < roiw16; sj += 16, dj += 64)
603 {
604 internal::prefetch(src + sj);
605 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
606 __asm__ (
607 "vld1.8 {d0-d1}, [%[in0]] \n\t"
608 "vmov.8 q1, q0 \n\t"
609 "vmov.8 q2, q0 \n\t"
610 "vmov.8 q3, q1 \n\t"
611 "vst4.8 {d2, d4, d6, d8}, [%[out0]] \n\t"
612 "vst4.8 {d3, d5, d7, d9}, [%[out1]] \n\t"
613 : /*no output*/
614 : [out0] "r" (dst + dj), [out1] "r" (dst + dj + 32),
615 [in0] "r" (src + sj),
616 "w" (vc255)
617 : "d0","d1","d2","d3","d4","d5","d6","d7"
618 );
619 #else
620 vRgba.val[0] = vld1q_u8(src + sj);
621
622 vRgba.val[1] = vRgba.val[0];
623 vRgba.val[2] = vRgba.val[0];
624
625 vst4q_u8(dst + dj, vRgba);
626 #endif
627 }
628
629 if (sj < roiw8)
630 {
631 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
632 __asm__ (
633 "vld1.8 {d5}, [%[in]] \n\t"
634 "vmov.8 d6, d5 \n\t"
635 "vmov.8 d7, d5 \n\t"
636 "vst4.8 {d5-d8}, [%[out]] \n\t"
637 : /*no output*/
638 : [out] "r" (dst + dj), [in] "r" (src + sj), "w" (vc255)
639 : "d5","d6","d7"
640 );
641 #else
642 vRgba2.val[0] = vld1_u8(src + sj);
643 vRgba2.val[1] = vRgba2.val[0];
644 vRgba2.val[2] = vRgba2.val[0];
645
646 vst4_u8(dst + dj, vRgba2);
647 #endif
648 sj += 8; dj += 32;
649 }
650
651 for (; sj < size.width; sj++, dj += 4)
652 {
653 dst[dj+0] = src[sj];
654 dst[dj+1] = src[sj];
655 dst[dj+2] = src[sj];
656 dst[dj+3] = 255;
657 }
658 }
659 #else
660 (void)size;
661 (void)srcBase;
662 (void)srcStride;
663 (void)dstBase;
664 (void)dstStride;
665 #endif
666 }
667
668 void rgb2rgbx(const Size2D &size,
669 const u8 * srcBase, ptrdiff_t srcStride,
670 u8 * dstBase, ptrdiff_t dstStride)
671 {
672 internal::assertSupportedConfiguration();
673 #ifdef CAROTENE_NEON
674 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
675 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
676 register uint8x8_t vc255_0 asm ("d3") = vmov_n_u8(255);
677 #else
678 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
679 union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;
680 v_dst0.v4.val[3] = vdupq_n_u8(255);
681 union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;
682 v_dst.v4.val[3] = vdup_n_u8(255);
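// The unions overlay the 3- and 4-channel register structs: vld3 fills val[0..2]
// in place while val[3] keeps the preset 255 alpha, so vst4 can write RGBA
// without an extra channel copy.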
683 #endif
684
685 for (size_t i = 0u; i < size.height; ++i)
686 {
687 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
688 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
689 size_t sj = 0u, dj = 0u, j = 0u;
690
691 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
692 for (; j < roiw8; sj += 24, dj += 32, j += 8)
693 {
694 internal::prefetch(src + sj);
695 __asm__ (
696 "vld3.8 {d0, d1, d2}, [%[in0]] \n\t"
697 "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"
698 : /*no output*/
699 : [out0] "r" (dst + dj),
700 [in0] "r" (src + sj),
701 "w" (vc255_0)
702 : "d0","d1","d2"
703 );
704 }
705 #else
706 for (; j < roiw16; sj += 48, dj += 64, j += 16)
707 {
708 internal::prefetch(src + sj);
709 v_dst0.v3 = vld3q_u8(src + sj);
710 vst4q_u8(dst + dj, v_dst0.v4);
711 }
712
713 if (j < roiw8)
714 {
715 v_dst.v3 = vld3_u8(src + sj);
716 vst4_u8(dst + dj, v_dst.v4);
717 sj += 24; dj += 32; j += 8;
718 }
719 #endif
720
721 for (; j < size.width; ++j, sj += 3, dj += 4)
722 {
723 dst[dj] = src[sj];
724 dst[dj + 1] = src[sj + 1];
725 dst[dj + 2] = src[sj + 2];
726 dst[dj + 3] = 255;
727 }
728 }
729 #else
730 (void)size;
731 (void)srcBase;
732 (void)srcStride;
733 (void)dstBase;
734 (void)dstStride;
735 #endif
736 }
737
738 void rgbx2rgb(const Size2D &size,
739 const u8 * srcBase, ptrdiff_t srcStride,
740 u8 * dstBase, ptrdiff_t dstStride)
741 {
742 internal::assertSupportedConfiguration();
743 #ifdef CAROTENE_NEON
744 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
745 #if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
746 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
747 union { uint8x16x4_t v4; uint8x16x3_t v3; } v_dst0;
748 union { uint8x8x4_t v4; uint8x8x3_t v3; } v_dst;
749 #endif
750
751 for (size_t i = 0u; i < size.height; ++i)
752 {
753 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
754 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
755 size_t sj = 0u, dj = 0u, j = 0u;
756
757 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
758 for (; j < roiw8; sj += 32, dj += 24, j += 8)
759 {
760 internal::prefetch(src + sj);
761 __asm__ (
762 "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"
763 "vst3.8 {d0, d1, d2}, [%[out0]] \n\t"
764 : /*no output*/
765 : [out0] "r" (dst + dj),
766 [in0] "r" (src + sj)
767 : "d0","d1","d2","d3"
768 );
769 }
770 #else
771 for (; j < roiw16; sj += 64, dj += 48, j += 16)
772 {
773 internal::prefetch(src + sj);
774 v_dst0.v4 = vld4q_u8(src + sj);
775 vst3q_u8(dst + dj, v_dst0.v3);
776 }
777
778 if (j < roiw8)
779 {
780 v_dst.v4 = vld4_u8(src + sj);
781 vst3_u8(dst + dj, v_dst.v3);
782 sj += 32; dj += 24; j += 8;
783 }
784 #endif
785
786 for (; j < size.width; ++j, sj += 4, dj += 3)
787 {
788 dst[dj] = src[sj];
789 dst[dj + 1] = src[sj + 1];
790 dst[dj + 2] = src[sj + 2];
791 }
792 }
793 #else
794 (void)size;
795 (void)srcBase;
796 (void)srcStride;
797 (void)dstBase;
798 (void)dstStride;
799 #endif
800 }
801
802 void rgb2bgr(const Size2D &size,
803 const u8 * srcBase, ptrdiff_t srcStride,
804 u8 * dstBase, ptrdiff_t dstStride)
805 {
806 internal::assertSupportedConfiguration();
807 #ifdef CAROTENE_NEON
808 #if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
809 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
810 #endif
811 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
812
813 for (size_t i = 0u; i < size.height; ++i)
814 {
815 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
816 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
817 size_t sj = 0u, dj = 0u, j = 0u;
818
819
820 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
821 for (; j < roiw8; sj += 24, dj += 24, j += 8)
822 {
823 internal::prefetch(src + sj);
824 __asm__ (
825 "vld3.8 {d0, d1, d2}, [%[in0]] \n\t"
826 "vswp d0, d2 \n\t"
827 "vst3.8 {d0, d1, d2}, [%[out0]] \n\t"
828 : /*no output*/
829 : [out0] "r" (dst + dj),
830 [in0] "r" (src + sj)
831 : "d0","d1","d2"
832 );
833 }
834 #else
835 for (; j < roiw16; sj += 48, dj += 48, j += 16)
836 {
837 internal::prefetch(src + sj);
838 uint8x16x3_t vals0 = vld3q_u8(src + sj);
839
840 std::swap(vals0.val[0], vals0.val[2]);
841
842 vst3q_u8(dst + dj, vals0);
843 }
844
845 if (j < roiw8)
846 {
847 uint8x8x3_t vals = vld3_u8(src + sj);
848 std::swap(vals.val[0], vals.val[2]);
849 vst3_u8(dst + dj, vals);
850 sj += 24; dj += 24; j += 8;
851 }
852 #endif
853
854 for (; j < size.width; ++j, sj += 3, dj += 3)
855 {
856 u8 b = src[sj + 2];//Handle src == dst case
857 dst[dj + 2] = src[sj ];
858 dst[dj + 1] = src[sj + 1];
859 dst[dj ] = b;
860 }
861 }
862 #else
863 (void)size;
864 (void)srcBase;
865 (void)srcStride;
866 (void)dstBase;
867 (void)dstStride;
868 #endif
869 }
870
871 void rgbx2bgrx(const Size2D &size,
872 const u8 * srcBase, ptrdiff_t srcStride,
873 u8 * dstBase, ptrdiff_t dstStride)
874 {
875 internal::assertSupportedConfiguration();
876 #ifdef CAROTENE_NEON
877 #if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
878 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
879 #endif
880 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
881
882 for (size_t i = 0u; i < size.height; ++i)
883 {
884 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
885 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
886 size_t sj = 0u, dj = 0u, j = 0u;
887
888 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
889 for (; j < roiw8; sj += 32, dj += 32, j += 8)
890 {
891 internal::prefetch(src + sj);
892 __asm__ (
893 "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"
894 "vswp d0, d2 \n\t"
895 "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"
896 : /*no output*/
897 : [out0] "r" (dst + dj),
898 [in0] "r" (src + sj)
899 : "d0","d1","d2","d3"
900 );
901 }
902 #else
903 for (; j < roiw16; sj += 64, dj += 64, j += 16)
904 {
905 internal::prefetch(src + sj);
906 uint8x16x4_t vals0 = vld4q_u8(src + sj);
907
908 std::swap(vals0.val[0], vals0.val[2]);
909
910 vst4q_u8(dst + dj, vals0);
911 }
912
913 if (j < roiw8)
914 {
915 uint8x8x4_t vals = vld4_u8(src + sj);
916 std::swap(vals.val[0], vals.val[2]);
917 vst4_u8(dst + dj, vals);
918 sj += 32; dj += 32; j += 8;
919 }
920 #endif
921
922 for (; j < size.width; ++j, sj += 4, dj += 4)
923 {
924 u8 b = src[sj + 2];//Handle src == dst case
925 dst[dj + 2] = src[sj ];
926 dst[dj + 1] = src[sj + 1];
927 dst[dj ] = b;
928 dst[dj + 3] = src[sj + 3];
929 }
930 }
931 #else
932 (void)size;
933 (void)srcBase;
934 (void)srcStride;
935 (void)dstBase;
936 (void)dstStride;
937 #endif
938 }
939
940 void rgbx2bgr(const Size2D &size,
941 const u8 * srcBase, ptrdiff_t srcStride,
942 u8 * dstBase, ptrdiff_t dstStride)
943 {
944 internal::assertSupportedConfiguration();
945 #ifdef CAROTENE_NEON
946 #if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
947 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
948 #endif
949 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
950
951 for (size_t i = 0u; i < size.height; ++i)
952 {
953 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
954 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
955 size_t sj = 0u, dj = 0u, j = 0u;
956
957 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
958 for (; j < roiw8; sj += 32, dj += 24, j += 8)
959 {
960 internal::prefetch(src + sj);
961 __asm__ (
962 "vld4.8 {d0, d1, d2, d3}, [%[in0]] \n\t"
963 "vswp d0, d2 \n\t"
964 "vst3.8 {d0, d1, d2}, [%[out0]] \n\t"
965 : /*no output*/
966 : [out0] "r" (dst + dj),
967 [in0] "r" (src + sj)
968 : "d0","d1","d2","d3"
969 );
970 }
971 #else
972 for (; j < roiw16; sj += 64, dj += 48, j += 16)
973 {
974 internal::prefetch(src + sj);
975 union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;
976 vals0.v4 = vld4q_u8(src + sj);
977 std::swap(vals0.v3.val[0], vals0.v3.val[2]);
978 vst3q_u8(dst + dj, vals0.v3);
979 }
980
981 if (j < roiw8)
982 {
983 union { uint8x8x4_t v4; uint8x8x3_t v3; } vals;
984 vals.v4 = vld4_u8(src + sj);
985 std::swap(vals.v3.val[0], vals.v3.val[2]);
986 vst3_u8(dst + dj, vals.v3);
987 sj += 32; dj += 24; j += 8;
988 }
989 #endif
990
991 for (; j < size.width; ++j, sj += 4, dj += 3)
992 {
993 dst[dj + 2] = src[sj ];
994 dst[dj + 1] = src[sj + 1];
995 dst[dj ] = src[sj + 2];
996 }
997 }
998 #else
999 (void)size;
1000 (void)srcBase;
1001 (void)srcStride;
1002 (void)dstBase;
1003 (void)dstStride;
1004 #endif
1005 }
1006
1007 void rgb2bgrx(const Size2D &size,
1008 const u8 * srcBase, ptrdiff_t srcStride,
1009 u8 * dstBase, ptrdiff_t dstStride)
1010 {
1011 internal::assertSupportedConfiguration();
1012 #ifdef CAROTENE_NEON
1013 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
1014 register uint8x8_t vc255 asm ("d3") = vmov_n_u8(255);
1015 #else
1016 union { uint8x16x4_t v4; uint8x16x3_t v3; } vals0;
1017 vals0.v4.val[3] = vmovq_n_u8(255);
1018 union { uint8x8x4_t v4; uint8x8x3_t v3; } vals8;
1019 vals8.v4.val[3] = vmov_n_u8(255);
1020 #endif
1021
1022 #if !(!defined(__aarch64__) && defined(__GNUC__) && defined(__arm__))
1023 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
1024 #endif
1025 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1026
1027 for (size_t i = 0u; i < size.height; ++i)
1028 {
1029 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1030 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1031 size_t sj = 0u, dj = 0u, j = 0u;
1032
1033 #if !defined(__aarch64__) && defined(__GNUC__) && defined(__arm__)
1034 for (; j < roiw8; sj += 24, dj += 32, j += 8)
1035 {
1036 internal::prefetch(src + sj);
1037 __asm__ (
1038 "vld3.8 {d0, d1, d2}, [%[in0]] \n\t"
1039 "vswp d0, d2 \n\t"
1040 "vst4.8 {d0, d1, d2, d3}, [%[out0]] \n\t"
1041 : /*no output*/
1042 : [out0] "r" (dst + dj),
1043 [in0] "r" (src + sj),
1044 "w" (vc255)
1045 : "d0","d1","d2"
1046 );
1047 }
1048 #else
1049 for (; j < roiw16; sj += 48, dj += 64, j += 16)
1050 {
1051 internal::prefetch(src + sj);
1052 vals0.v3 = vld3q_u8(src + sj);
1053 std::swap(vals0.v4.val[0], vals0.v4.val[2]);
1054 vst4q_u8(dst + dj, vals0.v4);
1055 }
1056
1057 if (j < roiw8)
1058 {
1059 vals8.v3 = vld3_u8(src + sj);
1060 std::swap(vals8.v4.val[0], vals8.v4.val[2]);
1061 vst4_u8(dst + dj, vals8.v4);
1062 sj += 24; dj += 32; j += 8;
1063 }
1064 #endif
1065
1066 for (; j < size.width; ++j, sj += 3, dj += 4)
1067 {
1068 dst[dj + 3] = 255;
1069 dst[dj + 2] = src[sj ];
1070 dst[dj + 1] = src[sj + 1];
1071 dst[dj ] = src[sj + 2];
1072 }
1073 }
1074 #else
1075 (void)size;
1076 (void)srcBase;
1077 (void)srcStride;
1078 (void)dstBase;
1079 (void)dstStride;
1080 #endif
1081 }
1082
1083 namespace {
1084
1085 #ifdef CAROTENE_NEON
1086 inline uint8x8x3_t convertToHSV(const uint8x8_t vR, const uint8x8_t vG, const uint8x8_t vB,
1087 const s32 hrange )
1088 {
1089 const s32 hsv_shift = 12;
1090 const f32 vsdiv_table = f32(255 << hsv_shift);
1091 f32 vhdiv_table = f32(hrange << hsv_shift);
1092 const s32 vhrange = hrange;
1093 const s32 v0 = s32(0);
1094 const s32 vshift = s32(1 << (hsv_shift-1));
1095 const s32 v6 = s32(6);
1096
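// The vector code below mirrors the scalar convertToHSV() further down:
//   V = max(R,G,B), diff = V - min(R,G,B), S = round(255 * diff / V),
//   H = (G-B)           if V == R,
//       (B-R) + 2*diff  if V == G,
//       (R-G) + 4*diff  otherwise,
// then H is scaled by hrange/(6*diff) and wrapped by +hrange if negative.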
1097 uint8x8_t vMin = vmin_u8(vR, vG);
1098 uint8x8_t vMax = vmax_u8(vR, vG);
1099
1100 uint16x8_t vR_u16 = vmovl_u8(vR);
1101 uint16x8_t vG_u16 = vmovl_u8(vG);
1102
1103 vMax = vmax_u8(vMax, vB);
1104 vMin = vmin_u8(vMin, vB);
1105 uint16x8_t vB_u16 = vmovl_u8(vB);
1106
1107 uint16x8_t vDiff = vsubl_u8(vMax, vMin);
1108
1109 uint16x8_t vV = vmovl_u8(vMax);
1110 uint16x8_t vDiffx2 = vaddq_u16(vDiff, vDiff);
1111 uint32x4_t vDiffL = vmovl_u16(vget_low_u16(vDiff));
1112 uint32x4_t vDiffH = vmovl_u16(vget_high_u16(vDiff));
1113
1114 uint16x8_t vVEqR = vceqq_u16(vR_u16, vV);
1115 uint16x8_t vVEqG = vceqq_u16(vG_u16, vV);
1116
1117 int16x8_t vG_B = vsubq_s16(vreinterpretq_s16_u16(vG_u16), vreinterpretq_s16_u16(vB_u16));
1118 uint16x8_t vInvR = vmvnq_u16(vVEqR);
1119 int16x8_t vB_R = vsubq_s16(vreinterpretq_s16_u16(vB_u16), vreinterpretq_s16_u16(vR_u16));
1120 int16x8_t vR_G = vsubq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vG_u16));
1121
1122 uint16x8_t vMask2 = vandq_u16(vVEqG, vInvR);
1123 vR_u16 = vandq_u16(vreinterpretq_u16_s16(vG_B), vVEqR);
1124 int16x8_t vH2 = vaddq_s16(vB_R, vreinterpretq_s16_u16(vDiffx2));
1125
1126 vVEqR = vmvnq_u16(vVEqG);
1127 vB_R = vaddq_s16(vreinterpretq_s16_u16(vDiffx2), vreinterpretq_s16_u16(vDiffx2));
1128 vG_B = vandq_s16(vreinterpretq_s16_u16(vInvR), vreinterpretq_s16_u16(vVEqR));
1129 vInvR = vandq_u16(vreinterpretq_u16_s16(vH2), vMask2);
1130 vR_G = vaddq_s16(vR_G, vB_R);
1131 int16x8_t vH = vaddq_s16(vreinterpretq_s16_u16(vR_u16), vreinterpretq_s16_u16(vInvR));
1132
1133 uint32x4_t vV_L = vmovl_u16(vget_low_u16(vV));
1134 vR_G = vandq_s16(vR_G, vG_B);
1135 uint32x4_t vV_H = vmovl_u16(vget_high_u16(vV));
1136 int16x8_t vDiff4 = vaddq_s16(vH, vR_G);
1137
1138 int32x4_t vc6 = vdupq_n_s32(v6);
1139 uint32x4_t vLine1 = vmulq_u32(vDiffL, vreinterpretq_u32_s32(vc6));
1140 uint32x4_t vLine2 = vmulq_u32(vDiffH, vreinterpretq_u32_s32(vc6));
1141
1142 float32x4_t vF1 = vcvtq_f32_u32(vV_L);
1143 float32x4_t vF2 = vcvtq_f32_u32(vV_H);
1144 float32x4_t vHF1 = vcvtq_f32_u32(vLine1);
1145 float32x4_t vHF2 = vcvtq_f32_u32(vLine2);
1146
1147 float32x4_t vXInv1 = vrecpeq_f32(vF1);
1148 float32x4_t vXInv2 = vrecpeq_f32(vF2);
1149 float32x4_t vXInv3 = vrecpeq_f32(vHF1);
1150 float32x4_t vXInv4 = vrecpeq_f32(vHF2);
1151
1152 float32x4_t vSt1 = vrecpsq_f32(vXInv1, vF1);
1153 float32x4_t vSt2 = vrecpsq_f32(vXInv2, vF2);
1154 float32x4_t vSt3 = vrecpsq_f32(vXInv3, vHF1);
1155 float32x4_t vSt4 = vrecpsq_f32(vXInv4, vHF2);
1156
1157 vF1 = vmulq_f32(vXInv1, vSt1);
1158 vF2 = vmulq_f32(vXInv2, vSt2);
1159 vHF1 = vmulq_f32(vXInv3, vSt3);
1160 vHF2 = vmulq_f32(vXInv4, vSt4);
1161
1162 float32x4_t vDivTab = vdupq_n_f32(vsdiv_table);
1163 vSt1 = vmulq_f32(vF1, vDivTab);
1164 vSt2 = vmulq_f32(vF2, vDivTab);
1165 vDivTab = vdupq_n_f32(vhdiv_table);
1166 vSt3 = vmulq_f32(vHF1, vDivTab);
1167 vSt4 = vmulq_f32(vHF2, vDivTab);
1168
1169 float32x4_t bias = vdupq_n_f32(0.5f);
1170
1171 vSt1 = vaddq_f32(vSt1, bias);
1172 vSt2 = vaddq_f32(vSt2, bias);
1173 vSt3 = vaddq_f32(vSt3, bias);
1174 vSt4 = vaddq_f32(vSt4, bias);
1175
1176 uint32x4_t vRes1 = vcvtq_u32_f32(vSt1);
1177 uint32x4_t vRes2 = vcvtq_u32_f32(vSt2);
1178 uint32x4_t vRes3 = vcvtq_u32_f32(vSt3);
1179 uint32x4_t vRes4 = vcvtq_u32_f32(vSt4);
1180
1181 int32x4_t vH_L = vmovl_s16(vget_low_s16(vDiff4));
1182 int32x4_t vH_H = vmovl_s16(vget_high_s16(vDiff4));
1183
1184 uint32x4_t vDiff_Res1 = vmulq_u32(vDiffL, vRes1);
1185 uint32x4_t vDiff_Res2 = vmulq_u32(vDiffH, vRes2);
1186 uint32x4_t vDiff_Res3 = vmulq_u32(vreinterpretq_u32_s32(vH_L), vRes3);
1187 uint32x4_t vDiff_Res4 = vmulq_u32(vreinterpretq_u32_s32(vH_H), vRes4);
1188
1189 int32x4_t vShift = vdupq_n_s32(vshift);
1190 uint32x4_t vAddRes1 = vaddq_u32(vDiff_Res1, vreinterpretq_u32_s32(vShift));
1191 uint32x4_t vAddRes2 = vaddq_u32(vDiff_Res2, vreinterpretq_u32_s32(vShift));
1192 uint32x4_t vAddRes3 = vaddq_u32(vDiff_Res3, vreinterpretq_u32_s32(vShift));
1193 uint32x4_t vAddRes4 = vaddq_u32(vDiff_Res4, vreinterpretq_u32_s32(vShift));
1194 int16x4_t vShrRes1 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes1), 8);
1195 int16x4_t vShrRes2 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes2), 8);
1196 int16x4_t vShrRes3 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes3), 8);
1197 int16x4_t vShrRes4 = vshrn_n_s32(vreinterpretq_s32_u32(vAddRes4), 8);
1198
1199 int16x8_t vc0 = vdupq_n_s16((s16)v0);
1200 int8x8_t vShrRes1_s8 = vshrn_n_s16(vcombine_s16(vShrRes1, vShrRes2), 4);
1201 uint16x8_t vCltRes_u16 = vcltq_s16(vcombine_s16(vShrRes3, vShrRes4), vc0);
1202 int8x8_t vShrRes2_s8 = vshrn_n_s16(vcombine_s16(vShrRes3, vShrRes4), 4);
1203
1204 int8x8_t vCltRes_s8 = vmovn_s16(vreinterpretq_s16_u16(vCltRes_u16));
1205 int8x8_t vcHRange = vdup_n_s8((s8)vhrange);
1206 uint8x8_t vHResAdd = vand_u8(vreinterpret_u8_s8(vCltRes_s8), vreinterpret_u8_s8(vcHRange));
1207 int8x8_t vHRes = vadd_s8(vShrRes2_s8, vreinterpret_s8_u8(vHResAdd));
1208
1209 uint8x8x3_t vHsv;
1210 vHsv.val[0] = vreinterpret_u8_s8(vHRes);
1211 vHsv.val[1] = vreinterpret_u8_s8(vShrRes1_s8);
1212 vHsv.val[2] = vMax;
1213
1214 return vHsv;
1215 }
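// The divisions 255/V and hrange/(6*diff) are done with vrecpe (reciprocal estimate)
// refined by one vrecps Newton-Raphson step; adding the 0.5f bias before the
// truncating vcvt gives round-to-nearest results.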
1216
1217 const u8 fastSaturate8u[] =
1218 {
1219 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1221 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1222 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1223 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1224 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1225 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1226 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1227 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1228 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1230 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1231 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1232 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1233 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1234 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1235 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1236 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1237 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1238 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
1239 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
1240 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
1241 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
1242 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
1243 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
1244 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
1245 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
1246 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
1247 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
1248 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
1249 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
1250 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
1251 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1252 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1253 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1254 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1255 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1256 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1257 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1258 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1259 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1260 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1261 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1262 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1263 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1264 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1265 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1266 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
1267 255
1268 };
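// fastSaturate8u[x + 256] is clamp(x, 0, 255) for x in [-256, 512]; the scalar HSV
// path below uses it to take channel-wise max/min without branching.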
1269
1270 inline void convertToHSV(const s32 r, const s32 g, const s32 b,
1271 const s32 &hrange, const s32 &hsv_shift,
1272 u8* dst)
1273 {
1274 s32 h, s, v = b;
1275 s32 vmin = b, diff;
1276 s32 vr, vg;
1277
1278 v += fastSaturate8u[g-v+256];
1279 v += fastSaturate8u[r-v+256];
1280 vmin -= fastSaturate8u[vmin-g+256];
1281 vmin -= fastSaturate8u[vmin-r+256];
1282
1283 diff = v - vmin;
1284 vr = v == r ? -1 : 0;
1285 vg = v == g ? -1 : 0;
1286
1287 s = (s32(diff * (255 << hsv_shift) * (1.0f/(f32)v)) + (1 << (hsv_shift-1))) >> hsv_shift;
1288 h = (vr & (g - b)) + (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
1289 h = ((h * s32((hrange << hsv_shift)/(6.f*diff) + 0.5)) + (1 << (hsv_shift-1))) >> hsv_shift;
1290 h += h < 0 ? hrange : 0;
1291
1292 dst[0] = internal::saturate_cast<u8>(h);
1293 dst[1] = (u8)s;
1294 dst[2] = (u8)v;
1295 }
1296
1297 #define CONVERT_TO_HSV_ASM(loadop, rreg, breg) \
1298 __asm__ ( \
1299 #loadop ", [%[in]] @RGB \n\t" \
1300 "vmin.u8 d3, d0, d1 @VMin (d3) \n\t" \
1301 "vmax.u8 d6, d0, d1 @V (d6) \n\t" \
1302 "vmovl.u8 q2, " #rreg " @V16_R (d4,d5) \n\t" \
1303 "vmovl.u8 q4, d1 @V16_G (d8,d9) \n\t" \
1304 "vmax.u8 d6, d6, d2 \n\t" \
1305 "vmin.u8 d3, d3, d2 \n\t" \
1306 "vmovl.u8 q0, " #breg " @V16_B (d0,d1) \n\t" \
1307 "vsubl.u8 q8, d6, d3 @V16_Diff (d16,d17) \n\t" \
1308 \
1309 "vmovl.u8 q5, d6 @V16_V (d10,d11) \n\t" \
1310 "vadd.s16 q10, q8, q8 @V16_Diff_2 (d20,d21) \n\t" \
1311 "vmovl.u16 q9, d16 @V32_Diff_L (d18,d19) \n\t" \
1312 "vmovl.u16 q11, d17 @V32_Diff_H (d22,d23) \n\t" \
1313 "vceq.u16 q12, q2, q5 @V==R(d24,d25) \n\t" \
1314 "vceq.u16 q13, q4, q5 @V==G(d26,d27) \n\t" \
1315 \
1316 "vsub.s16 q8, q4, q0 @V16_G-B (d16,d17) \n\t" \
1317 "vmvn.u16 q15, q12 @V16~R \n\t" \
1318 "vsub.s16 q6, q0, q2 @V16_B-R (d12,d13) \n\t" \
1319 "vsub.s16 q7, q2, q4 @V16_R-G (d14,d15) \n\t" \
1320 "vand.u16 q1, q13, q15 @VMask2 \n\t" \
1321 "vand.u16 q2, q8, q12 @V16_H(d4,d5) \n\t" \
1322 "vadd.s16 q4, q6, q10 @V16_H2 \n\t" \
1323 "vmvn.u16 q12, q13 @V16~G \n\t" \
1324 "vadd.s16 q6, q10, q10 @VDiff16_4 (d12,d13) \n\t" \
1325 "vand.u16 q8, q15, q12 @VMask3 \n\t" \
1326 "vand.u16 q15, q4, q1 @vH2(d30,d31) \n\t" \
1327 "vadd.s16 q7, q7, q6 @V16_H3 (d14,d15) \n\t" \
1328 "vadd.s16 q14, q2, q15 @vH16 \n\t" \
1329 "vmovl.u16 q12, d10 @V32_V_L \n\t" \
1330 "vand.s16 q7, q7, q8 @vH16 \n\t" \
1331 "vmovl.u16 q13, d11 @V32_V_H \n\t" \
1332 "vadd.s16 q2, q14, q7 @V16_Diff_4 \n\t" \
1333 \
1334 "vdup.32 q4, %[v6] \n\t" \
1335 "vmul.u32 q14, q9, q4 \n\t" \
1336 "vmul.u32 q15, q11, q4 \n\t" \
1337 "vcvt.f32.u32 q4, q12 @VF1 (d8,d9) \n\t" \
1338 "vcvt.f32.u32 q8, q13 @VF2 \n\t" \
1339 "vcvt.f32.u32 q0, q14 @HF1 \n\t" \
1340 "vcvt.f32.u32 q1, q15 @HF2 \n\t" \
1341 "vrecpe.f32 q12, q4 @Vxinv \n\t" \
1342 "vrecpe.f32 q13, q8 @Vxinv \n\t" \
1343 "vrecpe.f32 q5, q0 @Vxinv \n\t" \
1344 "vrecpe.f32 q7, q1 @Vxinv \n\t" \
1345 "vrecps.f32 q14, q12, q4 @Vst1 \n\t" \
1346 "vrecps.f32 q15, q13, q8 @Vst1 \n\t" \
1347 "vrecps.f32 q10, q5, q0 @Vst1 \n\t" \
1348 "vrecps.f32 q6, q7, q1 @Vst1 \n\t" \
1349 "vmul.f32 q4, q12, q14 \n\t" \
1350 "vmul.f32 q8, q13, q15 \n\t" \
1351 "vmul.f32 q0, q5, q10 \n\t" \
1352 "vmul.f32 q1, q7, q6 \n\t" \
1353 "vdup.32 q12, %[vsdiv_table] \n\t" \
1354 "vmul.f32 q14, q4, q12 \n\t" \
1355 "vmul.f32 q15, q8, q12 \n\t" \
1356 "vdup.32 q12, %[vhdiv_table] \n\t" \
1357 "vmul.f32 q10, q0, q12 \n\t" \
1358 "vmul.f32 q6, q1, q12 \n\t" \
1359 \
1360 "vdup.32 q12, %[bias] \n\t" \
1361 \
1362 "vadd.f32 q7, q14, q12 \n\t" \
1363 "vadd.f32 q13, q15, q12 \n\t" \
1364 "vcvt.u32.f32 q4, q7 \n\t" \
1365 "vcvt.u32.f32 q8, q13 \n\t" \
1366 \
1367 "vadd.f32 q14, q10, q12 \n\t" \
1368 "vadd.f32 q7, q6, q12 \n\t" \
1369 "vcvt.u32.f32 q0, q14 \n\t" \
1370 "vcvt.u32.f32 q1, q7 @Vres \n\t" \
1371 \
1372 "vmovl.s16 q7, d4 @V32_H_L (d14,d15) \n\t" \
1373 "vmovl.s16 q5, d5 @V32_H_H (d10,d11) \n\t" \
1374 "vmul.u32 q14, q9, q4 \n\t" \
1375 "vmul.u32 q15, q11, q8 \n\t" \
1376 "vmul.u32 q10, q7, q0 \n\t" \
1377 "vmul.u32 q6, q5, q1 \n\t" \
1378 \
1379 "vdup.32 q12, %[vshift] \n\t" \
1380 "vadd.u32 q13, q14, q12 \n\t" \
1381 "vadd.u32 q8, q15, q12 \n\t" \
1382 "vadd.u32 q0, q10, q12 \n\t" \
1383 "vadd.u32 q1, q6, q12 \n\t" \
1384 "vshrn.s32 d8, q13, #8 \n\t" \
1385 "vshrn.s32 d9, q8, #8 \n\t" \
1386 "vshrn.s32 d10, q0, #8 \n\t" \
1387 "vshrn.s32 d11, q1, #8 \n\t" \
1388 \
1389 "vdup.16 q8, %[v0] \n\t" \
1390 "vshrn.s16 d5, q4, #4 \n\t" \
1391 "vclt.s16 q9, q5, q8 \n\t" \
1392 "vshrn.s16 d4, q5, #4 \n\t" \
1393 \
1394 "vmovn.s16 d9, q9 \n\t" \
1395 "vdup.8 d7, %[vhrange] \n\t" \
1396 "vand.u8 d10, d9, d7 \n\t" \
1397 "vadd.s8 d4, d4, d10 \n\t" \
1398 "vst3.8 {d4-d6}, [%[out]] @HSV \n\t" \
1399 : /*no output*/ \
1400 : [out] "r" (dst + dj), [in] "r" (src + sj), \
1401 [vsdiv_table] "r" (vsdiv_table), \
1402 [vshift] "r" (vshift), \
1403 [vhdiv_table] "r" (vhdiv_table), \
1404 [v6] "r" (v6), [vhrange] "r" (vhrange), \
1405 [v0] "r" (v0), [bias] "r" (bias) \
1406 : "d0","d1","d2","d3","d4","d5","d6","d7", \
1407 "d8","d9","d10","d11","d12","d13","d14","d15", \
1408 "d16","d17","d18","d19","d20","d21","d22","d23", \
1409 "d24","d25","d26","d27","d28","d29","d30","d31" \
1410 );
1411
1412 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1413
1414 #define YCRCB_CONSTS \
1415 register int16x4_t vcYR asm ("d31") = vmov_n_s16(4899); \
1416 register int16x4_t vcYG asm ("d30") = vmov_n_s16(9617); \
1417 register int16x4_t vcYB asm ("d29") = vmov_n_s16(1868); \
1418 register int16x4_t vcCrG asm ("d28") = vmov_n_s16(6860); \
1419 register int16x4_t vcCrB asm ("d27") = vmov_n_s16(1332); \
1420 register int16x4_t vcCbR asm ("d26") = vmov_n_s16(2765); \
1421 register int16x4_t vcCbG asm ("d25") = vmov_n_s16(5427);
1422
1423 #else
1424
1425 #define YCRCB_CONSTS \
1426 const s16 convertCoeffs[] = { 4899, 4899, 4899, 4899, \
1427 9617, 9617, 9617, 9617, \
1428 1868, 1868, 1868, 1868, \
1429 6860, 6860, 6860, 6860, \
1430 1332, 1332, 1332, 1332, \
1431 2765, 2765, 2765, 2765, \
1432 5427, 5427, 5427, 5427 }; \
1433 const int16x8_t vcYRG = vld1q_s16(convertCoeffs); /*YR and YG*/ \
1434 const int16x4_t vcYB = vld1_s16(convertCoeffs + 8); /*YB*/ \
1435 const int16x8_t vcCrGB = vld1q_s16(convertCoeffs + 12); /*CrG and CrB*/ \
1436 const int16x8_t vcCbRG = vld1q_s16(convertCoeffs + 20); /*CbR and CbG*/
1437
1438 #endif
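// The YCrCb constants are the BT.601 coefficients scaled by 2^14:
//   Y  =  0.299*R + 0.587*G + 0.114*B              (4899, 9617, 1868)
//   Cr =  0.5*R - 0.4187*G - 0.0813*B + 128        (8192, 6860, 1332)
//   Cb = -0.1687*R - 0.3313*G + 0.5*B + 128        (2765, 5427, 8192)
// The 0.5 weight (8192 = 1 << 13) is applied as a left shift by 13 below.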
1439
1440 #define CONVERTTOYCRCB(loadcmd, rreg, greg, breg) \
1441 __asm__ ( \
1442 #loadcmd ", [%[in]] @RGB \n\t" \
1443 "vmovl.u8 q2, " #rreg " @R (d4,d5) \n\t" \
1444 "vmovl.u8 q3, " #greg " @G (d6,d7) \n\t" \
1445 "vmovl.u8 q4, " #breg " @B (d8,d9) \n\t" \
1446 \
1447 "vshll.u16 q7, d4, #13 @Cr(q7,q8): R \n\t" \
1448 "vmull.u16 q5, d6, d30 @Y (q5,q6): G \n\t" \
1449 "vshll.u16 q9, d8, #13 @Cb(q9,q10): B \n\t" \
1450 "vshll.u16 q8, d5, #13 @Cr(q7,q8): R \n\t" \
1451 "vmull.u16 q6, d7, d30 @Y (q5,q6): G \n\t" \
1452 "vshll.u16 q10, d9, #13 @Cb(q9,q10): B \n\t" \
1453 \
1454 "vmlsl.s16 q7, d6, d28 @Cr(q7,q8): RG \n\t" \
1455 "vmlal.s16 q5, d8, d29 @Y (q5,q6): GB \n\t" \
1456 "vmlsl.s16 q9, d4, d26 @Cb(q9,q10): BR \n\t" \
1457 "vmlsl.s16 q8, d7, d28 @Cr(q7,q8): RG \n\t" \
1458 "vmlal.s16 q6, d9, d29 @Y (q5,q6): GB \n\t" \
1459 "vmlsl.s16 q10, d5, d26 @Cb(q9,q10): BR \n\t" \
1460 \
1461 "vmlsl.s16 q7, d8, d27 @Cr(q7,q8): RGB \n\t" \
1462 "vmlal.s16 q5, d4, d31 @Y (q5,q6): GBR \n\t" \
1463 "vmlsl.s16 q9, d6, d25 @Cb(q9,q10): BRG \n\t" \
1464 "vmlsl.s16 q8, d9, d27 @Cr(q7,q8): RGB \n\t" \
1465 "vmlal.s16 q6, d5, d31 @Y (q5,q6): GBR \n\t" \
1466 "vmlsl.s16 q10, d7, d25 @Cb(q9,q10): BRG \n\t" \
1467 \
1468 "vrshrn.s32 d4, q7, #14 @Cr -> q2 \n\t" \
1469 "vrshrn.s32 d8, q5, #14 @Y -> q4 \n\t" \
1470 "vrshrn.s32 d6, q9, #14 @Cb -> q3 \n\t" \
1471 "vrshrn.s32 d5, q8, #14 @Cr -> q2 \n\t" \
1472 "vrshrn.s32 d9, q6, #14 @Y -> q4 \n\t" \
1473 "vrshrn.s32 d7, q10, #14 @Cb -> q3 \n\t" \
1474 \
1475 "vmov.s16 q5, #128 \n\t" \
1476 "vmov.s16 q6, #128 \n\t" \
1477 "vadd.i16 q5, q2 @Cr -> q5 \n\t" \
1478 "vadd.i16 q6, q3 @Cb -> q6 \n\t" \
1479 \
1480 "vqmovn.u16 d4, q4 \n\t" \
1481 "vqmovun.s16 d5, q5 \n\t" \
1482 "vqmovun.s16 d6, q6 \n\t" \
1483 \
1484 "vst3.8 {d4-d6}, [%[out]] \n\t" \
1485 : /*no output*/ \
1486 : [out] "r" (dst + dj), [in] "r" (src + sj), \
1487 "w" (vcYR), "w" (vcYG), "w" (vcYB), \
1488 "w" (vcCrB), "w" (vcCrG), "w" (vcCbG), "w" (vcCbR) \
1489 : "d0","d1","d2","d3","d4","d5","d6","d7", \
1490 "d8","d9","d10","d11","d12","d13","d14","d15", \
1491 "d16","d17","d18","d19","d20","d21" \
1492 );
1493
1494
1495 inline uint8x8x3_t convertToYCrCb( const int16x8_t& vR, const int16x8_t& vG, const int16x8_t& vB,
1496 const int16x8_t& vcYRG, const int16x4_t& vcYB,
1497 const int16x8_t& vcCrGB, const int16x8_t& vcCbRG )
1498 {
1499 int32x4_t vCrL = vshll_n_s16(vget_low_s16(vR), 13); // R
1500 int32x4_t vCrH = vshll_n_s16(vget_high_s16(vR), 13); // R
1501 int32x4_t vYL = vmull_s16(vget_low_s16(vG), vget_high_s16(vcYRG)); // G
1502 int32x4_t vYH = vmull_s16(vget_high_s16(vG), vget_high_s16(vcYRG)); // G
1503 int32x4_t vCbL = vshll_n_s16(vget_low_s16(vB), 13); // B
1504 int32x4_t vCbH = vshll_n_s16(vget_high_s16(vB), 13); // B
1505
1506 vCrL = vmlsl_s16(vCrL, vget_low_s16(vG), vget_low_s16(vcCrGB)); // RG
1507 vCrH = vmlsl_s16(vCrH, vget_high_s16(vG), vget_low_s16(vcCrGB)); // RG
1508 vYL = vmlal_s16(vYL, vget_low_s16(vB), vcYB); // GB
1509 vYH = vmlal_s16(vYH, vget_high_s16(vB), vcYB); // GB
1510 vCbL = vmlsl_s16(vCbL, vget_low_s16(vR), vget_low_s16(vcCbRG)); // BR
1511 vCbH = vmlsl_s16(vCbH, vget_high_s16(vR), vget_low_s16(vcCbRG)); // BR
1512
1513 vCrL = vmlsl_s16(vCrL, vget_low_s16(vB), vget_high_s16(vcCrGB)); // RGB
1514 vCrH = vmlsl_s16(vCrH, vget_high_s16(vB), vget_high_s16(vcCrGB)); // RGB
1515 vYL = vmlal_s16(vYL, vget_low_s16(vR), vget_low_s16(vcYRG)); // GBR
1516 vYH = vmlal_s16(vYH, vget_high_s16(vR), vget_low_s16(vcYRG)); // GBR
1517 vCbL = vmlsl_s16(vCbL, vget_low_s16(vG), vget_high_s16(vcCbRG)); // BRG
1518 vCbH = vmlsl_s16(vCbH, vget_high_s16(vG), vget_high_s16(vcCbRG)); // BRG
1519
1520 int16x4_t vCrL_ = vrshrn_n_s32(vCrL, 14);
1521 int16x4_t vCrH_ = vrshrn_n_s32(vCrH, 14);
1522 int16x4_t vYL_ = vrshrn_n_s32(vYL, 14);
1523 int16x4_t vYH_ = vrshrn_n_s32(vYH, 14);
1524 int16x4_t vCbL_ = vrshrn_n_s32(vCbL, 14);
1525 int16x4_t vCbH_ = vrshrn_n_s32(vCbH, 14);
1526
1527 int16x8_t vCr = vmovq_n_s16(128);
1528 int16x8_t vCb = vmovq_n_s16(128);
1529
1530 vCr = vaddq_s16(vCr, vcombine_s16(vCrL_, vCrH_));
1531 vCb = vaddq_s16(vCb, vcombine_s16(vCbL_, vCbH_));
1532
1533 uint8x8x3_t vYCrCb;
1534 vYCrCb.val[0] = vqmovn_u16(vreinterpretq_u16_s16(vcombine_s16(vYL_, vYH_)));
1535 vYCrCb.val[1] = vqmovun_s16(vCr);
1536 vYCrCb.val[2] = vqmovun_s16(vCb);
1537
1538 return vYCrCb;
1539 }
1540
1541 #define S_CONVERTTOYCRCB(R, G, B) \
1542 s32 Y = (R * 4899 + G * 9617 + B * 1868 + (1 << 13)) >> 14; \
1543 s32 Cr = 128 + ((R * 8192 - G * 6860 - B * 1332 + (1 << 13)) >> 14); \
1544 s32 Cb = 128 + ((R * (-2765) - G * 5427 + B * 8192 + (1 << 13)) >> 14); \
1545 dst[dj + 0] = internal::saturate_cast<u8>(Y); \
1546 dst[dj + 1] = internal::saturate_cast<u8>(Cr); \
1547 dst[dj + 2] = internal::saturate_cast<u8>(Cb);
1548
1549 #define COEFF_Y ( 149)
1550 #define COEFF_BU ( 129)
1551 #define COEFF_RV ( 102)
1552 #define COEFF_GU ( 25)
1553 #define COEFF_GV ( 52)
1554 #define COEFF_R (-14248)
1555 #define COEFF_G ( 8663)
1556 #define COEFF_B (-17705)
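// These appear to be the video-range BT.601 YUV -> RGB weights scaled by 64:
//   R = 1.164*(Y-16) + 1.596*(V-128)                   (149 ~ 1.164*128, 102 ~ 1.596*64)
//   G = 1.164*(Y-16) - 0.391*(U-128) - 0.813*(V-128)   (25 ~ 0.391*64, 52 ~ 0.813*64)
//   B = 1.164*(Y-16) + 2.018*(U-128)                   (129 ~ 2.018*64)
// COEFF_R, COEFF_G and COEFF_B fold the constant -16 luma and -128 chroma offsets
// into a single per-channel bias.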
1557
1558 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1559 #define YUV420ALPHA3_CONST
1560 #define YUV420ALPHA4_CONST register uint8x16_t c255 asm ("q13") = vmovq_n_u8(255);
1561 #define YUV420ALPHA3_CONVERT
1562 #define YUV420ALPHA4_CONVERT , "w" (c255)
1563 #define YUV420STORE1CMD3 "vst3.8 {d20, d22, d24}"
1564 #define YUV420STORE2CMD3 "vst3.8 {d21, d23, d25}"
1565 #define YUV420STORE1CMD4 "vst4.8 {d20, d22, d24, d26}"
1566 #define YUV420STORE2CMD4 "vst4.8 {d21, d23, d25, d27}"
1567
1568 #define YUV420_CONSTS(cn, bIdx, vIdx) \
1569 register const s32 cR = s16(COEFF_R); \
1570 register const s32 cG = s16(COEFF_G); \
1571 register const s32 cB = s16(COEFF_B); \
1572 \
1573 register uint8x16_t vc16 asm ("q15") = vmovq_n_u8(16); \
1574 register uint8x8_t cGU asm ("d14") = vmov_n_u8(COEFF_GU); \
1575 register uint8x8_t cGV asm ("d15") = vmov_n_u8(COEFF_GV); \
1576 register uint8x8_t cRV asm ("d16") = vmov_n_u8(COEFF_RV); \
1577 register uint8x8_t cBU asm ("d17") = vmov_n_u8(COEFF_BU); \
1578 register uint8x16_t cRGBY asm ("q3") = vmovq_n_u8(COEFF_Y); \
1579 YUV420ALPHA##cn##_CONST
1580
1581 #define CONVERTYUV420TORGB(cn, ureg, vreg, rreg, breg) \
1582 __asm__ ( \
1583 "vld2.8 {d0-d1}, [%[inUV]] @UV \n\t" \
1584 "vdup.16 q4, %[cG] @cG \n\t" \
1585 "vld2.8 {d2-d3}, [%[inY1]] @YY \n\t" \
1586 "vdup.16 "#rreg", %[cR] @cR \n\t" \
1587 "vld2.8 {d4-d5}, [%[inY2]] @YY \n\t" \
1588 "vdup.16 "#breg", %[cB] @cB \n\t" \
1589 "vmlsl.u8 q4, "#ureg", d14 @cG-25u \n\t" \
1590 "vmax.u8 q1, q15 @max(Y,16) \n\t" \
1591 "vmlal.u8 "#rreg", "#vreg", d16 @cR+102*v \n\t" \
1592 "vmlal.u8 "#breg", "#ureg", d17 @cB+129*u \n\t" \
1593 "vmax.u8 q2, q15 @max(Y,16) \n\t" \
1594 "vmlsl.u8 q4, "#vreg", d15 @cG-25u-52v \n\t" \
1595 /*q10,q11,q12,q13 - for output*/ \
1596 "vmull.u8 q9, d3, d6 @h 149*y \n\t" \
1597 "vmull.u8 q10, d2, d7 @l 149*y \n\t" \
1598 "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \
1599 "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \
1600 \
1601 "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
1602 "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \
1603 "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \
1604 "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
1605 "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \
1606 "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \
1607 \
1608 "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
1609 "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
1610 "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
1611 "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
1612 "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
1613 "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
1614 \
1615 "vzip.8 d22, d23 @G \n\t" \
1616 "vzip.8 d20, d21 @R \n\t" \
1617 "vzip.8 d24, d25 @B \n\t" \
1618 \
1619 YUV420STORE1CMD##cn", [%[out1]] \n\t" \
1620 YUV420STORE2CMD##cn", [%[out1x]] \n\t" \
1621 \
1622 "vmull.u8 q9, d5, d6 @h 149*y \n\t" \
1623 "vmull.u8 q10, d4, d7 @l 149*y \n\t" \
1624 "vshr.u16 q9, #1 @h (149*y)/2 \n\t" \
1625 "vshr.u16 q10, #1 @l (149*y)/2 \n\t" \
1626 \
1627 "vhadd.s16 q0, q9, q4 @hG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
1628 "vhadd.s16 q12, q10, q6 @lB ((149*y)/2 + cB + 129*u)/2 \n\t" \
1629 "vhadd.s16 q1, q9, q5 @hR ((149*y)/2 + cR + 102*v)/2 \n\t" \
1630 "vhadd.s16 q11, q10, q4 @lG ((149*y)/2 + cG - 25*u - 52*v)/2 \n\t" \
1631 "vhadd.s16 q9, q6 @hB ((149*y)/2 + cB + 129*u)/2 \n\t" \
1632 "vhadd.s16 q10, q5 @lR ((149*y)/2 + cR + 102*v)/2 \n\t" \
1633 \
1634 "vqrshrun.s16 d24, q12, #5 @lB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
1635 "vqrshrun.s16 d22, q11, #5 @lG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
1636 "vqrshrun.s16 d20, q10, #5 @lR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
1637 "vqrshrun.s16 d23, q0, #5 @hG ((149*y)/2 + cG - 25*u - 52*v)/2/32 \n\t" \
1638 "vqrshrun.s16 d21, q1, #5 @hR ((149*y)/2 + cR + 102*v)/2/32 \n\t" \
1639 "vqrshrun.s16 d25, q9, #5 @hB ((149*y)/2 + cB + 129*u)/2/32 \n\t" \
1640 \
1641 "vzip.8 d22, d23 @G \n\t" \
1642 "vzip.8 d20, d21 @R \n\t" \
1643 "vzip.8 d24, d25 @B \n\t" \
1644 \
1645 YUV420STORE1CMD##cn", [%[out2]] \n\t" \
1646 YUV420STORE2CMD##cn", [%[out2x]] \n\t" \
1647 \
1648 : /*no output*/ \
1649 : [out1] "r" (dst1 + dj), [out2] "r" (dst2 + dj), \
1650 [out1x] "r" (dst1 + dj+cn*8), [out2x] "r" (dst2 + dj+cn*8), \
1651 [inUV] "r" (uv+j), [inY1] "r" (y1+j), [inY2] "r" (y2+j), \
1652 [cR] "r" (cR), [cG] "r" (cG), [cB] "r" (cB), \
1653 "w" (vc16), "w" (cGU), "w" (cGV), "w" (cBU), "w" (cRV), "w" (cRGBY) YUV420ALPHA##cn##_CONVERT \
1654 : "d0","d1","d2","d3","d4","d5","d8","d9","d10","d11","d12", \
1655 "d13","d18","d19","d20","d21","d22","d23","d24","d25" \
1656 );
1657
1658 #else
1659
1660 template<int bIdx>
1661 struct _convertYUV420Internals
1662 {
1663 uint16x8_t vc14216;
1664 uint16x8_t vc17672;
1665 uint16x8_t vc8696;
1666 uint8x8_t vc102;
1667 uint8x8_t vc25;
1668 uint8x8_t vc129;
1669 uint8x8_t vc52;
1670 uint16x8_t vc_1;
1671 uint8x8_t vc149;
1672 uint8x8_t vc16;
1673     _convertYUV420Internals()
1674 {
1675 vc14216 = vdupq_n_u16(-COEFF_R);
1676 vc17672 = vdupq_n_u16(-COEFF_B);
1677 vc8696 = vdupq_n_u16(COEFF_G);
1678 vc102 = vdup_n_u8(COEFF_RV);
1679 vc25 = vdup_n_u8(COEFF_GU);
1680 vc129 = vdup_n_u8(COEFF_BU);
1681 vc52 = vdup_n_u8(COEFF_GV);
1682 vc_1 = vdupq_n_u16((uint16_t)-1);
1683 vc149 = vdup_n_u8(COEFF_Y);
1684 vc16 = vdup_n_u8(16);
1685 }
1686
1687     inline void UVrgbToRGB( const int16x8_t &ruv, const int16x8_t &guv, const int16x8_t &buv,
1688 const u8 *y, uint8x16x3_t &rgbl )
1689 {
1690         //load 16 Y samples of this row; vld2 splits them into even/odd pixel lanes
1691 uint8x8x2_t yl = vld2_u8(y);
1692 yl.val[0] = vmax_u8(yl.val[0], vc16);
1693 yl.val[1] = vmax_u8(yl.val[1], vc16);
1694
1695         //Y term of the fixed-point formula: (-1 + 149*y), then halved
1696 uint16x8_t yodd1 = vmlal_u8(vc_1, yl.val[0], vc149); //(-1+149*y)
1697 uint16x8_t yevn1 = vmlal_u8(vc_1, yl.val[1], vc149); //(-1+149*y)
1698 int16x8_t yodd1h = (int16x8_t)vshrq_n_u16(yodd1, 1); //(-1+149*y)/2
1699 int16x8_t yevn1h = (int16x8_t)vshrq_n_u16(yevn1, 1); //(-1+149*y)/2
1700
1701         //combine the halved Y term with the per-channel UV terms
1702 int16x8_t rodd1w = vhsubq_s16(yodd1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
1703 int16x8_t gevn1w = vhaddq_s16(yevn1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
1704 int16x8_t bodd1w = vhsubq_s16(yodd1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
1705 int16x8_t revn1w = vhsubq_s16(yevn1h, ruv); //((-1+149*y)/2 - (14216-102*v))/2
1706 int16x8_t godd1w = vhaddq_s16(yodd1h, guv); //((-1+149*y)/2 + ((8696-25*u)-52*v))/2
1707 int16x8_t bevn1w = vhsubq_s16(yevn1h, buv); //((-1+149*y)/2 - (17672-129*u))/2
1708
1709 //y line clamp + narrow
1710 uint8x8_t rodd1n = vqshrun_n_s16(rodd1w, 5);
1711 uint8x8_t revn1n = vqshrun_n_s16(revn1w, 5);
1712 uint8x8_t godd1n = vqshrun_n_s16(godd1w, 5);
1713 uint8x8x2_t r1 = vzip_u8 (rodd1n, revn1n);
1714 uint8x8_t gevn1n = vqshrun_n_s16(gevn1w, 5);
1715 uint8x8_t bodd1n = vqshrun_n_s16(bodd1w, 5);
1716 uint8x8x2_t g1 = vzip_u8 (godd1n, gevn1n);
1717 uint8x8_t bevn1n = vqshrun_n_s16(bevn1w, 5);
1718 uint8x8x2_t b1 = vzip_u8 (bodd1n, bevn1n);
1719 rgbl.val[2 - bIdx] = vcombine_u8(r1.val[0], r1.val[1]);
1720 rgbl.val[1] = vcombine_u8(g1.val[0], g1.val[1]);
1721 rgbl.val[0 + bIdx] = vcombine_u8(b1.val[0], b1.val[1]);
1722 }
1723 };
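// Scaling note (added): UVrgbToRGB above stays within 16 bits by halving twice --
// (149*y - 1) is shifted right by one, then vhadd/vhsub with the precomputed UV term
// halves again -- and the final vqshrun_n_s16(.., 5) divides by 32 with unsigned
// saturation. The net Y gain is therefore 149/128 ~ 1.164, matching the float
// formulas quoted in yuv420sp2rgb() below; the "-1" seeded through vc_1 appears to
// compensate for the truncating shifts.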
1724
1725 template<int cn, int bIdx, int vIdx>
1726 struct _convertYUV420
1727 {
1728 _convertYUV420Internals<bIdx> convertYUV420Internals;
1729
1730     inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
1731 u8 *dst1, u8 *dst2 )
1732 {
1733 uint8x8x2_t raw_uv = vld2_u8(uv);
1734 uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)
1735 int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
1736
1737 int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
1738 int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))
1739
1740 uint8x16x3_t rgbl;
1741 //y line1
1742 convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl);
1743 vst3q_u8(dst1, rgbl);
1744 //y line2
1745 convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl);
1746 vst3q_u8(dst2, rgbl);
1747 }
1748 };
1749
1750 template<int bIdx, int vIdx>
1751 struct _convertYUV420<4, bIdx, vIdx>
1752 {
1753 _convertYUV420Internals<bIdx> convertYUV420Internals;
1754
1755     inline void ToRGB( const u8 *y1, const u8 *y2, const u8 *uv,
1756 u8 *dst1, u8 *dst2 )
1757 {
1758 uint8x8x2_t raw_uv = vld2_u8(uv);
1759 uint16x8_t gu = vmlsl_u8(convertYUV420Internals.vc8696, raw_uv.val[1-vIdx], convertYUV420Internals.vc25); //(8696-25*u)
1760 int16x8_t ruv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc14216, raw_uv.val[vIdx], convertYUV420Internals.vc102); //(14216-102*v)
1761
1762 int16x8_t buv = (int16x8_t)vmlsl_u8(convertYUV420Internals.vc17672, raw_uv.val[1-vIdx], convertYUV420Internals.vc129); //(17672-129*u)
1763 int16x8_t guv = (int16x8_t)vmlsl_u8(gu, raw_uv.val[vIdx], convertYUV420Internals.vc52); //((8696-25*u)-52*v))
1764
1765 union { uint8x16x4_t v4; uint8x16x3_t v3; } rgbl;
1766 rgbl.v4.val[3] = vdupq_n_u8(0xff);
1767 //y line1
1768 convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y1, rgbl.v3);
1769 vst4q_u8(dst1, rgbl.v4);
1770 //y line2
1771 convertYUV420Internals.UVrgbToRGB(ruv, guv, buv, y2, rgbl.v3);
1772 vst4q_u8(dst2, rgbl.v4);
1773 }
1774 };
1775
1776 #define YUV420_CONSTS(cn, bIdx, vIdx) _convertYUV420<cn, bIdx, vIdx> convertYUV420;
1777
1778 #endif
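// Both branches above implement the same YUV 4:2:0 -> RGB(X)/BGR(X) kernel: 32-bit
// GCC 4.x before 4.7 (and not Clang) gets the inline-asm version with its constants
// pinned to fixed q/d registers, while every other configuration uses the
// NEON-intrinsics _convertYUV420 helper that YUV420_CONSTS instantiates.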
1779
1780 template <int cn> inline void fillAlpha(u8 *, u8 *){}
1781 template <> inline void fillAlpha<4>(u8 *dst1, u8 *dst2)
1782 {
1783 dst1[3] = 255;
1784 dst1[7] = 255;
1785 dst2[3] = 255;
1786 dst2[7] = 255;
1787 }
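// Scalar tail used by the row loops below (descriptive note): convertYUV420ToRGB
// expands one 2x2 block of Y samples sharing a single interleaved (U, V) pair -- the
// 4:2:0 layout -- into two pixels on each of the two destination rows, and
// fillAlpha<4> then patches the alpha bytes of the RGBX/BGRX variants to 255.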
1788 template <int cn, int bIdx, int vIdx>
1789 inline void convertYUV420ToRGB(const u8 *y1, const u8 *y2, const u8 *uv, u8* dst1, u8 *dst2)
1790 {
1791 int Y11 = y1[0];
1792 int Y12 = y1[1];
1793 int Y21 = y2[0];
1794 int Y22 = y2[1];
1795
1796 int U = uv[1 - vIdx];
1797 int V = uv[vIdx];
1798
1799 int y11 = (COEFF_Y * std::max(16, Y11)) >> 1;
1800 int y12 = (COEFF_Y * std::max(16, Y12)) >> 1;
1801 int y21 = (COEFF_Y * std::max(16, Y21)) >> 1;
1802 int y22 = (COEFF_Y * std::max(16, Y22)) >> 1;
1803
1804 int uvR = COEFF_R + COEFF_RV * V;
1805 int uvG = COEFF_G - COEFF_GU * U - COEFF_GV * V;
1806 int uvB = COEFF_B + COEFF_BU * U;
1807
1808 dst1[2-bIdx] = internal::saturate_cast<u8>((((y11 + uvR) >> 1) + (1 << 4)) >> 5);
1809 dst1[1] = internal::saturate_cast<u8>((((y11 + uvG) >> 1) + (1 << 4)) >> 5);
1810 dst1[bIdx] = internal::saturate_cast<u8>((((y11 + uvB) >> 1) + (1 << 4)) >> 5);
1811
1812 dst1[cn+2-bIdx] = internal::saturate_cast<u8>((((y12 + uvR) >> 1) + (1 << 4)) >> 5);
1813 dst1[cn+1] = internal::saturate_cast<u8>((((y12 + uvG) >> 1) + (1 << 4)) >> 5);
1814 dst1[cn+bIdx] = internal::saturate_cast<u8>((((y12 + uvB) >> 1) + (1 << 4)) >> 5);
1815
1816 dst2[2-bIdx] = internal::saturate_cast<u8>((((y21 + uvR) >> 1) + (1 << 4)) >> 5);
1817 dst2[1] = internal::saturate_cast<u8>((((y21 + uvG) >> 1) + (1 << 4)) >> 5);
1818 dst2[bIdx] = internal::saturate_cast<u8>((((y21 + uvB) >> 1) + (1 << 4)) >> 5);
1819
1820 dst2[cn+2-bIdx] = internal::saturate_cast<u8>((((y22 + uvR) >> 1) + (1 << 4)) >> 5);
1821 dst2[cn+1] = internal::saturate_cast<u8>((((y22 + uvG) >> 1) + (1 << 4)) >> 5);
1822 dst2[cn+bIdx] = internal::saturate_cast<u8>((((y22 + uvB) >> 1) + (1 << 4)) >> 5);
1823
1824 fillAlpha<cn>(dst1, dst2);
1825 }
1826
1827 // converts R, G, B (B, G, R) pixels to RGB(BGR)565 format respectively
1828 inline uint8x16x2_t convertTo565( const uint8x16_t& vR, const uint8x16_t& vG, const uint8x16_t& vB )
1829 {
1830 uint8x16x2_t vRgb565; // rrrrRRRR ggggGGGG bbbbBBBB
1831
1832 vRgb565.val[1] = vsriq_n_u8(vB, vG, 5); // xxxxxxxx bbbbBggg
1833 vRgb565.val[0] = vshlq_n_u8(vG, 3); // gGGGG000 bbbbBggg
1834 vRgb565.val[0] = vsriq_n_u8(vRgb565.val[0], vR, 3); // gGGrrrrR bbbbBggg
1835
1836 return vRgb565;
1837 }
1838 inline void convertTo565( const u16 R, const u16 G, const u16 B, u8 * dst )
1839 {
1840 *((u16*)dst) = (R >> 3)|((G&~3) << 3)|((B&~7) << 8);
1841 }
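// Packing detail (added note): the scalar overload above puts its first argument in
// bits 0-4, the second in bits 5-10 and the third in bits 11-15 of the 16-bit value,
// since (G & ~3) << 3 == (G >> 2) << 5 and (B & ~7) << 8 == (B >> 3) << 11; for
// example R = G = B = 255 packs to 31 | (63 << 5) | (31 << 11) = 0xFFFF. The vector
// overload produces the matching byte pair (low byte in val[0], high byte in val[1])
// with vshl/vsri.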
1842 #endif
1843
1844 } //namespace
1845
1846 void rgb2hsv(const Size2D &size,
1847 const u8 * srcBase, ptrdiff_t srcStride,
1848 u8 * dstBase, ptrdiff_t dstStride,
1849 s32 hrange)
1850 {
1851 internal::assertSupportedConfiguration();
1852 #ifdef CAROTENE_NEON
1853 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1854 const s32 hsv_shift = 12;
1855 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1856 register const f32 vsdiv_table = f32(255 << hsv_shift);
1857 register f32 vhdiv_table = f32(hrange << hsv_shift);
1858 register const s32 vhrange = hrange;
1859 register const s32 v0 = s32(0);
1860 register const s32 vshift = s32(1 << (hsv_shift-1));
1861 register const s32 v6 = s32(6);
1862 register const f32 bias = 0.5f;
1863 #endif
1864
1865 for (size_t i = 0u; i < size.height; ++i)
1866 {
1867 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1868 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1869 size_t sj = 0u, dj = 0u, j = 0u;
1870
1871 for (; j < roiw8; sj += 24, dj += 24, j += 8)
1872 {
1873 internal::prefetch(src + sj);
1874 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1875 CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d0, d2)
1876 #else
1877 uint8x8x3_t vRgb = vld3_u8(src + sj);
1878 uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
1879 vst3_u8(dst + dj, vHsv);
1880 #endif
1881 }
1882
1883 for (; j < size.width; ++j, sj += 3, dj += 3)
1884 {
1885 convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
1886 }
1887 }
1888 #else
1889 (void)size;
1890 (void)srcBase;
1891 (void)srcStride;
1892 (void)dstBase;
1893 (void)dstStride;
1894 (void)hrange;
1895 #endif
1896 }
1897
1898 void rgbx2hsv(const Size2D &size,
1899 const u8 * srcBase, ptrdiff_t srcStride,
1900 u8 * dstBase, ptrdiff_t dstStride,
1901 s32 hrange)
1902 {
1903 internal::assertSupportedConfiguration();
1904 #ifdef CAROTENE_NEON
1905 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1906 const s32 hsv_shift = 12;
1907 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1908 register const f32 vsdiv_table = f32(255 << hsv_shift);
1909 register f32 vhdiv_table = f32(hrange << hsv_shift);
1910 register const s32 vhrange = hrange;
1911 register const s32 v0 = s32(0);
1912 register const s32 vshift = s32(1 << (hsv_shift-1));
1913 register const s32 v6 = s32(6);
1914 register const f32 bias = 0.5f;
1915 #endif
1916
1917 for (size_t i = 0u; i < size.height; ++i)
1918 {
1919 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1920 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1921 size_t sj = 0u, dj = 0u, j = 0u;
1922
1923 for (; j < roiw8; sj += 32, dj += 24, j += 8)
1924 {
1925 internal::prefetch(src + sj);
1926 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1927 CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d0, d2)
1928 #else
1929 uint8x8x4_t vRgb = vld4_u8(src + sj);
1930 uint8x8x3_t vHsv = convertToHSV(vRgb.val[0], vRgb.val[1], vRgb.val[2], hrange);
1931 vst3_u8(dst + dj, vHsv);
1932 #endif
1933 }
1934
1935 for (; j < size.width; ++j, sj += 4, dj += 3)
1936 {
1937 convertToHSV(src[sj], src[sj+1], src[sj+2], hrange, hsv_shift, dst+dj);
1938 }
1939 }
1940 #else
1941 (void)size;
1942 (void)srcBase;
1943 (void)srcStride;
1944 (void)dstBase;
1945 (void)dstStride;
1946 (void)hrange;
1947 #endif
1948 }
1949
1950 void bgr2hsv(const Size2D &size,
1951 const u8 * srcBase, ptrdiff_t srcStride,
1952 u8 * dstBase, ptrdiff_t dstStride,
1953 s32 hrange)
1954 {
1955 internal::assertSupportedConfiguration();
1956 #ifdef CAROTENE_NEON
1957 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
1958 const s32 hsv_shift = 12;
1959 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1960 register const f32 vsdiv_table = f32(255 << hsv_shift);
1961 register f32 vhdiv_table = f32(hrange << hsv_shift);
1962 register const s32 vhrange = hrange;
1963 register const s32 v0 = s32(0);
1964 register const s32 vshift = s32(1 << (hsv_shift-1));
1965 register const s32 v6 = s32(6);
1966 register const f32 bias = 0.5f;
1967 #endif
1968
1969 for (size_t i = 0u; i < size.height; ++i)
1970 {
1971 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
1972 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
1973 size_t sj = 0u, dj = 0u, j = 0u;
1974
1975 for (; j < roiw8; sj += 24, dj += 24, j += 8)
1976 {
1977 internal::prefetch(src + sj);
1978 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
1979 CONVERT_TO_HSV_ASM(vld3.8 {d0-d2}, d2, d0)
1980 #else
1981 uint8x8x3_t vRgb = vld3_u8(src + sj);
1982 uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);
1983 vst3_u8(dst + dj, vHsv);
1984 #endif
1985 }
1986
1987 for (; j < size.width; ++j, sj += 3, dj += 3)
1988 {
1989 convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);
1990 }
1991 }
1992 #else
1993 (void)size;
1994 (void)srcBase;
1995 (void)srcStride;
1996 (void)dstBase;
1997 (void)dstStride;
1998 (void)hrange;
1999 #endif
2000 }
2001
2002 void bgrx2hsv(const Size2D &size,
2003 const u8 * srcBase, ptrdiff_t srcStride,
2004 u8 * dstBase, ptrdiff_t dstStride,
2005 s32 hrange)
2006 {
2007 internal::assertSupportedConfiguration();
2008 #ifdef CAROTENE_NEON
2009 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2010 const s32 hsv_shift = 12;
2011 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2012 register const f32 vsdiv_table = f32(255 << hsv_shift);
2013 register f32 vhdiv_table = f32(hrange << hsv_shift);
2014 register const s32 vhrange = hrange;
2015 register const s32 v0 = s32(0);
2016 register const s32 vshift = s32(1 << (hsv_shift-1));
2017 register const s32 v6 = s32(6);
2018 register const f32 bias = 0.5f;
2019 #endif
2020
2021 for (size_t i = 0u; i < size.height; ++i)
2022 {
2023 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2024 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2025 size_t sj = 0u, dj = 0u, j = 0u;
2026
2027 for (; j < roiw8; sj += 32, dj += 24, j += 8)
2028 {
2029 internal::prefetch(src + sj);
2030 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2031 CONVERT_TO_HSV_ASM(vld4.8 {d0-d3}, d2, d0)
2032 #else
2033 uint8x8x4_t vRgb = vld4_u8(src + sj);
2034 uint8x8x3_t vHsv = convertToHSV(vRgb.val[2], vRgb.val[1], vRgb.val[0], hrange);
2035 vst3_u8(dst + dj, vHsv);
2036 #endif
2037 }
2038
2039 for (; j < size.width; ++j, sj += 4, dj += 3)
2040 {
2041 convertToHSV(src[sj+2], src[sj+1], src[sj], hrange, hsv_shift, dst+dj);
2042 }
2043 }
2044 #else
2045 (void)size;
2046 (void)srcBase;
2047 (void)srcStride;
2048 (void)dstBase;
2049 (void)dstStride;
2050 (void)hrange;
2051 #endif
2052 }
2053
2054 void rgbx2bgr565(const Size2D &size,
2055 const u8 * srcBase, ptrdiff_t srcStride,
2056 u8 * dstBase, ptrdiff_t dstStride)
2057 {
2058 internal::assertSupportedConfiguration();
2059 #ifdef CAROTENE_NEON
2060 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2061
2062 for (size_t i = 0u; i < size.height; ++i)
2063 {
2064 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2065 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2066 size_t sj = 0u, dj = 0u, j = 0u;
2067
2068 for (; j < roiw16; sj += 64, dj += 32, j += 16)
2069 {
2070 internal::prefetch(src + sj);
2071 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2072 __asm__ (
2073 "vld4.8 {d2, d4, d6, d8}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t"
2074 "vld4.8 {d3, d5, d7, d9}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2075 "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2076 "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2077 "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2078 "vst2.8 {d0, d2}, [%[out0]] \n\t"
2079 "vst2.8 {d1, d3}, [%[out1]] \n\t"
2080 : /*no output*/
2081 : [out0] "r" (dst + dj),
2082 [out1] "r" (dst + dj + 16),
2083 [in0] "r" (src + sj),
2084 [in1] "r" (src + sj + 32)
2085 : "d0","d1","d2","d3","d4","d5","d6","d7","d8","d9"
2086 );
2087 #else
2088 uint8x16x4_t vRgba = vld4q_u8(src + sj);
2089 uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);
2090 vst2q_u8(dst + dj, vVal565);
2091 #endif
2092 }
2093
2094 for (; j < size.width; ++j, sj += 4, dj += 2)
2095 {
2096 convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);
2097 }
2098 }
2099 #else
2100 (void)size;
2101 (void)srcBase;
2102 (void)srcStride;
2103 (void)dstBase;
2104 (void)dstStride;
2105 #endif
2106 }
2107
2108 void rgb2bgr565(const Size2D &size,
2109 const u8 * srcBase, ptrdiff_t srcStride,
2110 u8 * dstBase, ptrdiff_t dstStride)
2111 {
2112 internal::assertSupportedConfiguration();
2113 #ifdef CAROTENE_NEON
2114 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2115
2116 for (size_t i = 0u; i < size.height; ++i)
2117 {
2118 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2119 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2120 size_t sj = 0u, dj = 0u, j = 0u;
2121
2122 for (; j < roiw16; sj += 48, dj += 32, j += 16)
2123 {
2124 internal::prefetch(src + sj);
2125 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2126 __asm__ (
2127 "vld3.8 {d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 q4 \n\t"
2128 "vld3.8 {d3, d5, d7}, [%[in1]] @ xxxxxxxx rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2129 "vsri.8 q1, q2, #5 @ xxxxxxxx rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2130 "vshl.u8 q0, q2, #3 @ gGGGG000 rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2131 "vsri.8 q0, q3, #3 @ gGGbbbbB rrrrRggg ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2132 "vst2.8 {d0, d2}, [%[out0]] \n\t"
2133 "vst2.8 {d1, d3}, [%[out1]] \n\t"
2134 : /*no output*/
2135 : [out0] "r" (dst + dj),
2136 [out1] "r" (dst + dj + 16),
2137 [in0] "r" (src + sj),
2138 [in1] "r" (src + sj + 24)
2139 : "d0","d1","d2","d3","d4","d5","d6","d7"
2140 );
2141 #else
2142 uint8x16x3_t vRgba = vld3q_u8(src + sj);
2143 uint8x16x2_t vVal565 = convertTo565(vRgba.val[2], vRgba.val[1], vRgba.val[0]);
2144 vst2q_u8(dst + dj, vVal565);
2145 #endif
2146 }
2147
2148 for (; j < size.width; ++j, sj += 3, dj += 2)
2149 {
2150 convertTo565(src[sj + 2], src[sj + 1], src[sj], dst + dj);
2151 }
2152 }
2153 #else
2154 (void)size;
2155 (void)srcBase;
2156 (void)srcStride;
2157 (void)dstBase;
2158 (void)dstStride;
2159 #endif
2160 }
2161
2162 void rgbx2rgb565(const Size2D &size,
2163 const u8 * srcBase, ptrdiff_t srcStride,
2164 u8 * dstBase, ptrdiff_t dstStride)
2165 {
2166 internal::assertSupportedConfiguration();
2167 #ifdef CAROTENE_NEON
2168 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2169
2170 for (size_t i = 0u; i < size.height; ++i)
2171 {
2172 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2173 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2174 size_t sj = 0u, dj = 0u, j = 0u;
2175
2176 for (; j < roiw16; sj += 64, dj += 32, j += 16)
2177 {
2178 internal::prefetch(src + sj);
2179 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2180 __asm__ (
2181 "vld4.8 {d0, d2, d4, d6}, [%[in0]] @ q0 q1 q2 q3 \n\t"
2182 "vld4.8 {d1, d3, d5, d7}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB aaaaAAAA \n\t"
2183 "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg aaaaAAAA \n\t"
2184 "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg aaaaAAAA \n\t"
2185 "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg aaaaAAAA \n\t"
2186 "vst2.8 {d2, d4}, [%[out0]] \n\t"
2187 "vst2.8 {d3, d5}, [%[out1]] \n\t"
2188 : /*no output*/
2189 : [out0] "r" (dst + dj),
2190 [out1] "r" (dst + dj + 16),
2191 [in0] "r" (src + sj),
2192 [in1] "r" (src + sj + 32)
2193 : "d0","d1","d2","d3","d4","d5","d6","d7"
2194 );
2195 #else
2196 uint8x16x4_t vRgba = vld4q_u8(src + sj);
2197 uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);
2198 vst2q_u8(dst + dj, vVal565);
2199 #endif
2200 }
2201
2202 for (; j < size.width; ++j, sj += 4, dj += 2)
2203 {
2204 convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);
2205 }
2206 }
2207 #else
2208 (void)size;
2209 (void)srcBase;
2210 (void)srcStride;
2211 (void)dstBase;
2212 (void)dstStride;
2213 #endif
2214 }
2215
2216 void rgb2rgb565(const Size2D &size,
2217 const u8 * srcBase, ptrdiff_t srcStride,
2218 u8 * dstBase, ptrdiff_t dstStride)
2219 {
2220 internal::assertSupportedConfiguration();
2221 #ifdef CAROTENE_NEON
2222 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2223
2224 for (size_t i = 0u; i < size.height; ++i)
2225 {
2226 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2227 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2228 size_t sj = 0u, dj = 0u, j = 0u;
2229
2230 for (; j < roiw16; sj += 48, dj += 32, j += 16)
2231 {
2232 internal::prefetch(src + sj);
2233 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2234 __asm__ (
2235 "vld3.8 {d0, d2, d4}, [%[in0]] @ q0 q1 q2 q3 \n\t"
2236 "vld3.8 {d1, d3, d5}, [%[in1]] @ rrrrRRRR ggggGGGG bbbbBBBB xxxxxxxx \n\t"
2237 "vsri.8 q2, q1, #5 @ rrrrRRRR ggggGGGG bbbbBggg xxxxxxxx \n\t"
2238 "vshl.u8 q1, #3 @ rrrrRRRR gGGGG000 bbbbBggg xxxxxxxx \n\t"
2239 "vsri.8 q1, q0, #3 @ rrrrRRRR gGGrrrrR bbbbBggg xxxxxxxx \n\t"
2240 "vst2.8 {d2, d4}, [%[out0]] \n\t"
2241 "vst2.8 {d3, d5}, [%[out1]] \n\t"
2242 : /*no output*/
2243 : [out0] "r" (dst + dj),
2244 [out1] "r" (dst + dj + 16),
2245 [in0] "r" (src + sj),
2246 [in1] "r" (src + sj + 24)
2247 : "d0","d1","d2","d3","d4","d5"
2248 );
2249 #else
2250 uint8x16x3_t vRgba = vld3q_u8(src + sj);
2251 uint8x16x2_t vVal565 = convertTo565(vRgba.val[0], vRgba.val[1], vRgba.val[2]);
2252 vst2q_u8(dst + dj, vVal565);
2253 #endif
2254 }
2255
2256 for (; j < size.width; ++j, sj += 3, dj += 2)
2257 {
2258 convertTo565(src[sj], src[sj + 1], src[sj + 2], dst + dj);
2259 }
2260 }
2261 #else
2262 (void)size;
2263 (void)srcBase;
2264 (void)srcStride;
2265 (void)dstBase;
2266 (void)dstStride;
2267 #endif
2268 }
2269
2270 void rgb2ycrcb(const Size2D &size,
2271 const u8 * srcBase, ptrdiff_t srcStride,
2272 u8 * dstBase, ptrdiff_t dstStride)
2273 {
2274 internal::assertSupportedConfiguration();
2275 #ifdef CAROTENE_NEON
2276 YCRCB_CONSTS
2277 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2278
2279 for (size_t i = 0u; i < size.height; ++i)
2280 {
2281 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2282 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2283 size_t sj = 0u, dj = 0u, j = 0u;
2284
2285 for (; j < roiw8; sj += 24, dj += 24, j += 8)
2286 {
2287 internal::prefetch(src + sj);
2288 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2289 CONVERTTOYCRCB(vld3.8 {d0-d2}, d0, d1, d2)
2290 #else
2291 uint8x8x3_t vRgb = vld3_u8(src + sj);
2292 int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[0]));
2293 int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[1]));
2294 int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgb.val[2]));
2295 uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2296 vst3_u8(dst + dj, vYCrCb);
2297 #endif
2298 }
2299
2300 for (; j < size.width; ++j, sj += 3, dj += 3)
2301 {
2302 S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);
2303 }
2304 }
2305 #else
2306 (void)size;
2307 (void)srcBase;
2308 (void)srcStride;
2309 (void)dstBase;
2310 (void)dstStride;
2311 #endif
2312 }
2313
2314 void rgbx2ycrcb(const Size2D &size,
2315 const u8 * srcBase, ptrdiff_t srcStride,
2316 u8 * dstBase, ptrdiff_t dstStride)
2317 {
2318 internal::assertSupportedConfiguration();
2319 #ifdef CAROTENE_NEON
2320 YCRCB_CONSTS
2321 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2322
2323 for (size_t i = 0u; i < size.height; ++i)
2324 {
2325 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2326 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2327 size_t sj = 0u, dj = 0u, j = 0u;
2328
2329 for (; j < roiw8; sj += 32, dj += 24, j += 8)
2330 {
2331 internal::prefetch(src + sj);
2332 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2333 CONVERTTOYCRCB(vld4.8 {d0-d3}, d0, d1, d2)
2334 #else
2335 uint8x8x4_t vRgba = vld4_u8(src + sj);
2336 int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[0]));
2337 int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[1]));
2338 int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vRgba.val[2]));
2339 uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2340 vst3_u8(dst + dj, vYCrCb);
2341 #endif
2342 }
2343
2344 for (; j < size.width; ++j, sj += 4, dj += 3)
2345 {
2346 S_CONVERTTOYCRCB(src[sj], src[sj + 1], src[sj + 2]);
2347 }
2348 }
2349 #else
2350 (void)size;
2351 (void)srcBase;
2352 (void)srcStride;
2353 (void)dstBase;
2354 (void)dstStride;
2355 #endif
2356 }
2357
2358 void bgr2ycrcb(const Size2D &size,
2359 const u8 * srcBase, ptrdiff_t srcStride,
2360 u8 * dstBase, ptrdiff_t dstStride)
2361 {
2362 internal::assertSupportedConfiguration();
2363 #ifdef CAROTENE_NEON
2364 YCRCB_CONSTS
2365 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2366
2367 for (size_t i = 0u; i < size.height; ++i)
2368 {
2369 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2370 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2371 size_t sj = 0u, dj = 0u, j = 0u;
2372
2373 for (; j < roiw8; sj += 24, dj += 24, j += 8)
2374 {
2375 internal::prefetch(src + sj);
2376 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2377 CONVERTTOYCRCB(vld3.8 {d0-d2}, d2, d1, d0)
2378 #else
2379 uint8x8x3_t vBgr = vld3_u8(src + sj);
2380 int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[0]));
2381 int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[1]));
2382 int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgr.val[2]));
2383 uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2384 vst3_u8(dst + dj, vYCrCb);
2385 #endif
2386 }
2387
2388 for (; j < size.width; ++j, sj += 3, dj += 3)
2389 {
2390 S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);
2391 }
2392 }
2393 #else
2394 (void)size;
2395 (void)srcBase;
2396 (void)srcStride;
2397 (void)dstBase;
2398 (void)dstStride;
2399 #endif
2400 }
2401
2402 void bgrx2ycrcb(const Size2D &size,
2403 const u8 * srcBase, ptrdiff_t srcStride,
2404 u8 * dstBase, ptrdiff_t dstStride)
2405 {
2406 internal::assertSupportedConfiguration();
2407 #ifdef CAROTENE_NEON
2408 YCRCB_CONSTS
2409 size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;
2410
2411 for (size_t i = 0u; i < size.height; ++i)
2412 {
2413 const u8 * src = internal::getRowPtr(srcBase, srcStride, i);
2414 u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
2415 size_t sj = 0u, dj = 0u, j = 0u;
2416
2417 for (; j < roiw8; sj += 32, dj += 24, j += 8)
2418 {
2419 internal::prefetch(src + sj);
2420 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2421 CONVERTTOYCRCB(vld4.8 {d0-d3}, d2, d1, d0)
2422 #else
2423 uint8x8x4_t vBgra = vld4_u8(src + sj);
2424 int16x8_t vB = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[0]));
2425 int16x8_t vG = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[1]));
2426 int16x8_t vR = vreinterpretq_s16_u16(vmovl_u8(vBgra.val[2]));
2427 uint8x8x3_t vYCrCb = convertToYCrCb(vR, vG, vB, vcYRG, vcYB, vcCrGB, vcCbRG);
2428 vst3_u8(dst + dj, vYCrCb);
2429 #endif
2430 }
2431
2432 for (; j < size.width; ++j, sj += 4, dj += 3)
2433 {
2434 S_CONVERTTOYCRCB(src[sj + 2], src[sj + 1], src[sj]);
2435 }
2436 }
2437 #else
2438 (void)size;
2439 (void)srcBase;
2440 (void)srcStride;
2441 (void)dstBase;
2442 (void)dstStride;
2443 #endif
2444 }
2445
2446 void yuv420sp2rgb(const Size2D &size,
2447 const u8 * yBase, ptrdiff_t yStride,
2448 const u8 * uvBase, ptrdiff_t uvStride,
2449 u8 * dstBase, ptrdiff_t dstStride)
2450 {
2451 // input data:
2452 ////////////// Y matrix:
2453 // {y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14, y15, y16}
2454 // {Y1, Y2, Y3, Y4, Y5, Y6, Y7, Y8, Y9, Y10, Y11, Y12, Y13, Y14, Y15, Y16}
2455 ////////////// UV matrix:
2456     // {v12, u12, v34, u34, v56, u56, v78, u78, v90, u90, V12, U12, V34, U34, V56, U56}
2457
2458 // fp version
2459 // R = 1.164(Y - 16) + 1.596(V - 128)
2460 // G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
2461 // B = 1.164(Y - 16) + 2.018(U - 128)
2462
2463 // integer version
2464 // R = [((149*y)/2 + (-14248+102*v) )/2]/32
2465 // G = [((149*y)/2 + ((8663- 25*u)-52*v))/2]/32
2466 // B = [((149*y)/2 + (-17705+129*u) )/2]/32
2467
2468 // error estimation:
2469 //Rerr = 0.0000625 * y - 0.00225 * v - 0.287
2470 //Gerr = 0.0000625 * y + 0.0005 * v + 0.000375 * u + 0.128625
2471 //Berr = 0.0000625 * y - 0.002375 * u - 0.287375
2472
2473 //real error test:
2474 //=================
2475 //R: 1 less: 520960 == 3.11% of full space
2476 //G: 1 less: 251425 == 1.50% of full space
2477 //B: 1 less: 455424 == 2.71% of full space
2478 //=================
2479 //R: 1 more: 642048 == 3.83% of full space
2480 //G: 1 more: 192458 == 1.15% of full space
2481 //B: 1 more: 445184 == 2.65% of full space
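    // Worked example of the integer form above (added for illustration), for
    // Y = 128 and neutral chroma U = V = 128:
    //   R = [((149*128)/2 + (-14248 + 102*128))/2]/32 = [(9536 - 1192)/2]/32 ~ 130
    // which agrees with the float version: 1.164*(128 - 16) ~ 130.4.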
2482
2483 internal::assertSupportedConfiguration();
2484 #ifdef CAROTENE_NEON
2485 YUV420_CONSTS(3, 2, 0)
2486 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2487
2488 for (size_t i = 0u; i < size.height; i+=2)
2489 {
2490 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2491 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2492 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2493 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2494 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2495
2496 size_t dj = 0u, j = 0u;
2497 for (; j < roiw16; dj += 48, j += 16)
2498 {
2499 internal::prefetch(uv + j);
2500 internal::prefetch(y1 + j);
2501 internal::prefetch(y2 + j);
2502 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2503 CONVERTYUV420TORGB(3, d1, d0, q5, q6)
2504 #else
2505 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2506 #endif
2507 }
2508 for (; j + 2 <= size.width; j+=2, dj += 6)
2509 {
2510 convertYUV420ToRGB<3, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2511 }
2512 }
2513 #else
2514 (void)size;
2515 (void)yBase;
2516 (void)yStride;
2517 (void)uvBase;
2518 (void)uvStride;
2519 (void)dstBase;
2520 (void)dstStride;
2521 #endif
2522 }
2523
2524 void yuv420sp2rgbx(const Size2D &size,
2525 const u8 * yBase, ptrdiff_t yStride,
2526 const u8 * uvBase, ptrdiff_t uvStride,
2527 u8 * dstBase, ptrdiff_t dstStride)
2528 {
2529 internal::assertSupportedConfiguration();
2530 #ifdef CAROTENE_NEON
2531 YUV420_CONSTS(4, 2, 0)
2532 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2533
2534 for (size_t i = 0u; i < size.height; i+=2)
2535 {
2536 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2537 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2538 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2539 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2540 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2541
2542 size_t dj = 0u, j = 0u;
2543 for (; j < roiw16; dj += 64, j += 16)
2544 {
2545 internal::prefetch(uv + j);
2546 internal::prefetch(y1 + j);
2547 internal::prefetch(y2 + j);
2548 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2549 CONVERTYUV420TORGB(4, d1, d0, q5, q6)
2550 #else
2551 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2552 #endif
2553 }
2554 for (; j + 2 <= size.width; j+=2, dj += 8)
2555 {
2556 convertYUV420ToRGB<4, 2, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2557 }
2558 }
2559 #else
2560 (void)size;
2561 (void)yBase;
2562 (void)yStride;
2563 (void)uvBase;
2564 (void)uvStride;
2565 (void)dstBase;
2566 (void)dstStride;
2567 #endif
2568 }
2569
2570 void yuv420i2rgb(const Size2D &size,
2571 const u8 * yBase, ptrdiff_t yStride,
2572 const u8 * uvBase, ptrdiff_t uvStride,
2573 u8 * dstBase, ptrdiff_t dstStride)
2574 {
2575 internal::assertSupportedConfiguration();
2576 #ifdef CAROTENE_NEON
2577 YUV420_CONSTS(3, 2, 1)
2578 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2579
2580 for (size_t i = 0u; i < size.height; i+=2)
2581 {
2582 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2583 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2584 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2585 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2586 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2587
2588 size_t dj = 0u, j = 0u;
2589 for (; j < roiw16; dj += 48, j += 16)
2590 {
2591 internal::prefetch(uv + j);
2592 internal::prefetch(y1 + j);
2593 internal::prefetch(y2 + j);
2594 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2595 CONVERTYUV420TORGB(3, d0, d1, q5, q6)
2596 #else
2597 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2598 #endif
2599 }
2600 for (; j + 2 <= size.width; j+=2, dj += 6)
2601 {
2602 convertYUV420ToRGB<3, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2603 }
2604 }
2605 #else
2606 (void)size;
2607 (void)yBase;
2608 (void)yStride;
2609 (void)uvBase;
2610 (void)uvStride;
2611 (void)dstBase;
2612 (void)dstStride;
2613 #endif
2614 }
2615
2616 void yuv420i2rgbx(const Size2D &size,
2617 const u8 * yBase, ptrdiff_t yStride,
2618 const u8 * uvBase, ptrdiff_t uvStride,
2619 u8 * dstBase, ptrdiff_t dstStride)
2620 {
2621 internal::assertSupportedConfiguration();
2622 #ifdef CAROTENE_NEON
2623 YUV420_CONSTS(4, 2, 1)
2624 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2625
2626 for (size_t i = 0u; i < size.height; i+=2)
2627 {
2628 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2629 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2630 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2631 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2632 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2633
2634 size_t dj = 0u, j = 0u;
2635 for (; j < roiw16; dj += 64, j += 16)
2636 {
2637 internal::prefetch(uv + j);
2638 internal::prefetch(y1 + j);
2639 internal::prefetch(y2 + j);
2640 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2641 CONVERTYUV420TORGB(4, d0, d1, q5, q6)
2642 #else
2643 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2644 #endif
2645 }
2646 for (; j + 2 <= size.width; j+=2, dj += 8)
2647 {
2648 convertYUV420ToRGB<4, 2, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2649 }
2650 }
2651 #else
2652 (void)size;
2653 (void)yBase;
2654 (void)yStride;
2655 (void)uvBase;
2656 (void)uvStride;
2657 (void)dstBase;
2658 (void)dstStride;
2659 #endif
2660 }
2661
2662 void yuv420sp2bgr(const Size2D &size,
2663 const u8 * yBase, ptrdiff_t yStride,
2664 const u8 * uvBase, ptrdiff_t uvStride,
2665 u8 * dstBase, ptrdiff_t dstStride)
2666 {
2667 internal::assertSupportedConfiguration();
2668 #ifdef CAROTENE_NEON
2669 YUV420_CONSTS(3, 0, 0)
2670 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2671
2672 for (size_t i = 0u; i < size.height; i+=2)
2673 {
2674 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2675 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2676 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2677 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2678 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2679
2680 size_t dj = 0u, j = 0u;
2681 for (; j < roiw16; dj += 48, j += 16)
2682 {
2683 internal::prefetch(uv + j);
2684 internal::prefetch(y1 + j);
2685 internal::prefetch(y2 + j);
2686 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2687 CONVERTYUV420TORGB(3, d1, d0, q6, q5)
2688 #else
2689 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2690 #endif
2691 }
2692 for (; j + 2 <= size.width; j+=2, dj += 6)
2693 {
2694 convertYUV420ToRGB<3, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2695 }
2696 }
2697 #else
2698 (void)size;
2699 (void)yBase;
2700 (void)yStride;
2701 (void)uvBase;
2702 (void)uvStride;
2703 (void)dstBase;
2704 (void)dstStride;
2705 #endif
2706 }
2707
2708 void yuv420sp2bgrx(const Size2D &size,
2709 const u8 * yBase, ptrdiff_t yStride,
2710 const u8 * uvBase, ptrdiff_t uvStride,
2711 u8 * dstBase, ptrdiff_t dstStride)
2712 {
2713 internal::assertSupportedConfiguration();
2714 #ifdef CAROTENE_NEON
2715 YUV420_CONSTS(4, 0, 0)
2716 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2717
2718 for (size_t i = 0u; i < size.height; i+=2)
2719 {
2720 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2721 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2722 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2723 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2724 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2725
2726 size_t dj = 0u, j = 0u;
2727 for (; j < roiw16; dj += 64, j += 16)
2728 {
2729 internal::prefetch(uv + j);
2730 internal::prefetch(y1 + j);
2731 internal::prefetch(y2 + j);
2732 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2733 CONVERTYUV420TORGB(4, d1, d0, q6, q5)
2734 #else
2735 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2736 #endif
2737 }
2738 for (; j + 2 <= size.width; j+=2, dj += 8)
2739 {
2740 convertYUV420ToRGB<4, 0, 0>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2741 }
2742 }
2743 #else
2744 (void)size;
2745 (void)yBase;
2746 (void)yStride;
2747 (void)uvBase;
2748 (void)uvStride;
2749 (void)dstBase;
2750 (void)dstStride;
2751 #endif
2752 }
2753
2754 void yuv420i2bgr(const Size2D &size,
2755 const u8 * yBase, ptrdiff_t yStride,
2756 const u8 * uvBase, ptrdiff_t uvStride,
2757 u8 * dstBase, ptrdiff_t dstStride)
2758 {
2759 internal::assertSupportedConfiguration();
2760 #ifdef CAROTENE_NEON
2761 YUV420_CONSTS(3, 0, 1)
2762 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2763
2764 for (size_t i = 0u; i < size.height; i+=2)
2765 {
2766 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2767 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2768 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2769 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2770 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2771
2772 size_t dj = 0u, j = 0u;
2773 for (; j < roiw16; dj += 48, j += 16)
2774 {
2775 internal::prefetch(uv + j);
2776 internal::prefetch(y1 + j);
2777 internal::prefetch(y2 + j);
2778 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2779 CONVERTYUV420TORGB(3, d0, d1, q6, q5)
2780 #else
2781 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2782 #endif
2783 }
2784 for (; j + 2 <= size.width; j+=2, dj += 6)
2785 {
2786 convertYUV420ToRGB<3, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2787 }
2788 }
2789 #else
2790 (void)size;
2791 (void)yBase;
2792 (void)yStride;
2793 (void)uvBase;
2794 (void)uvStride;
2795 (void)dstBase;
2796 (void)dstStride;
2797 #endif
2798 }
2799
2800 void yuv420i2bgrx(const Size2D &size,
2801 const u8 * yBase, ptrdiff_t yStride,
2802 const u8 * uvBase, ptrdiff_t uvStride,
2803 u8 * dstBase, ptrdiff_t dstStride)
2804 {
2805 internal::assertSupportedConfiguration();
2806 #ifdef CAROTENE_NEON
2807 YUV420_CONSTS(4, 0, 1)
2808 size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
2809
2810 for (size_t i = 0u; i < size.height; i+=2)
2811 {
2812 const u8 * uv = internal::getRowPtr(uvBase, uvStride, i>>1);
2813 const u8 * y1 = internal::getRowPtr(yBase, yStride, i);
2814 const u8 * y2 = internal::getRowPtr(yBase, yStride, i+1);
2815 u8 * dst1 = internal::getRowPtr(dstBase, dstStride, i);
2816 u8 * dst2 = internal::getRowPtr(dstBase, dstStride, i+1);
2817
2818 size_t dj = 0u, j = 0u;
2819 for (; j < roiw16; dj += 64, j += 16)
2820 {
2821 internal::prefetch(uv + j);
2822 internal::prefetch(y1 + j);
2823 internal::prefetch(y2 + j);
2824 #if !defined(__aarch64__) && defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ < 7 && !defined(__clang__)
2825 CONVERTYUV420TORGB(4, d0, d1, q6, q5)
2826 #else
2827 convertYUV420.ToRGB(y1 + j, y2 + j, uv + j, dst1 + dj, dst2 + dj);
2828 #endif
2829 }
2830 for (; j + 2 <= size.width; j+=2, dj += 8)
2831 {
2832 convertYUV420ToRGB<4, 0, 1>(y1+j, y2+j, uv+j, dst1 + dj, dst2 + dj);
2833 }
2834 }
2835 #else
2836 (void)size;
2837 (void)yBase;
2838 (void)yStride;
2839 (void)uvBase;
2840 (void)uvStride;
2841 (void)dstBase;
2842 (void)dstStride;
2843 #endif
2844 }
2845
2846 } // namespace CAROTENE_NS
2847