/*
 * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty. In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-neon.c. */


/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
 * chroma upsampling and YCbCr -> RGB color conversion into a single function.
 *
 * As with the standalone functions, YCbCr -> RGB conversion is defined by the
 * following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdmerge-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */
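
/* For example, the G term is descaled with rounding as
 *    G - Y = (-11277 * (Cb - 128) - 23401 * (Cr - 128) + 2^14) >> 15
 * (assuming the constant table stores -11277 and 23401 in that order), so for
 * Cb = 200 and Cr = 60:
 *    (-11277 * 72 - 23401 * -68 + 16384) >> 15 = 795708 >> 15 = 24
 * which agrees with -0.34414 * 72 - 0.71414 * -68 = 23.78 rounded to the
 * nearest integer.
 */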

/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
 * routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */
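
/* In the partial-tail code below, full vectors of Y, Cb, and Cr samples are
 * still loaded and converted even when cols_remaining < 16, relying on the
 * safe-overread guarantee above.  Stores in the tail, however, are performed
 * lane by lane via the switch statements, so no more than output_width pixels
 * are ever written.
 */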

/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
 */

void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
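    /* Note: vqrdmulhq_lane_s16() computes a rounding doubling multiply and
     * returns the high half, i.e. roughly a * b * 2^-15.  Doubling the Cr/Cb
     * difference first with vshlq_n_s16(x, 1) therefore gives an effective
     * scale of 2^-14, matching the scale of the 1.40200 and 1.77200 constants
     * described at the top of this file.
     */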
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba;
    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr, rgba);
#else
    uint8x16x3_t rgb;
    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr, rgb);
#endif

    /* Increment pointers. */
    inptr0 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr += (RGB_PIXELSIZE * 16);
  }

  if (cols_remaining > 0) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba_h;
    rgba_h.val[RGB_RED] = r.val[1];
    rgba_h.val[RGB_GREEN] = g.val[1];
    rgba_h.val[RGB_BLUE] = b.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    uint8x8x4_t rgba_l;
    rgba_l.val[RGB_RED] = r.val[0];
    rgba_l.val[RGB_GREEN] = g.val[0];
    rgba_l.val[RGB_BLUE] = b.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst4_u8(outptr, rgba_l);
      break;
    case 7:
      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst4_lane_u8(outptr, rgba_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#else
    uint8x8x3_t rgb_h;
    rgb_h.val[RGB_RED] = r.val[1];
    rgb_h.val[RGB_GREEN] = g.val[1];
    rgb_h.val[RGB_BLUE] = b.val[1];
    uint8x8x3_t rgb_l;
    rgb_l.val[RGB_RED] = r.val[0];
    rgb_l.val[RGB_GREEN] = g.val[0];
    rgb_l.val[RGB_BLUE] = b.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst3_u8(outptr, rgb_l);
      break;
    case 7:
      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst3_lane_u8(outptr, rgb_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#endif
  }
}


/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
 *
 * See comments above for details regarding color conversion and safe memory
 * access.
 */

void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr0, outptr1;
  /* Pointers to Y (both rows), Cb, and Cr data */
  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr0 = output_buf[0];
  outptr1 = output_buf[1];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples
     * the chroma components both horizontally and vertically.
     */
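    /* (Note: the same Cb/Cr samples are applied to both output rows; as with
     * the horizontal direction, this is plain sample replication rather than
     * the smooth "fancy" upsampling variant.)
     */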
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba0, rgba1;
    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr0, rgba0);
    vst4q_u8(outptr1, rgba1);
#else
    uint8x16x3_t rgb0, rgb1;
    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr0, rgb0);
    vst3q_u8(outptr1, rgb1);
#endif

    /* Increment pointers. */
    inptr0_0 += 16;
    inptr0_1 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr0 += (RGB_PIXELSIZE * 16);
    outptr1 += (RGB_PIXELSIZE * 16);
  }

  if (cols_remaining > 0) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples
     * the chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba0_h, rgba1_h;
    rgba0_h.val[RGB_RED] = r0.val[1];
    rgba1_h.val[RGB_RED] = r1.val[1];
    rgba0_h.val[RGB_GREEN] = g0.val[1];
    rgba1_h.val[RGB_GREEN] = g1.val[1];
    rgba0_h.val[RGB_BLUE] = b0.val[1];
    rgba1_h.val[RGB_BLUE] = b1.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);

    uint8x8x4_t rgba0_l, rgba1_l;
    rgba0_l.val[RGB_RED] = r0.val[0];
    rgba1_l.val[RGB_RED] = r1.val[0];
    rgba0_l.val[RGB_GREEN] = g0.val[0];
    rgba1_l.val[RGB_GREEN] = g1.val[0];
    rgba0_l.val[RGB_BLUE] = b0.val[0];
    rgba1_l.val[RGB_BLUE] = b1.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst4_u8(outptr0, rgba0_l);
      vst4_u8(outptr1, rgba1_l);
      break;
    case 7:
      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst4_lane_u8(outptr0, rgba0_l, 0);
      vst4_lane_u8(outptr1, rgba1_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#else
    uint8x8x3_t rgb0_h, rgb1_h;
    rgb0_h.val[RGB_RED] = r0.val[1];
    rgb1_h.val[RGB_RED] = r1.val[1];
    rgb0_h.val[RGB_GREEN] = g0.val[1];
    rgb1_h.val[RGB_GREEN] = g1.val[1];
    rgb0_h.val[RGB_BLUE] = b0.val[1];
    rgb1_h.val[RGB_BLUE] = b1.val[1];

    uint8x8x3_t rgb0_l, rgb1_l;
    rgb0_l.val[RGB_RED] = r0.val[0];
    rgb1_l.val[RGB_RED] = r1.val[0];
    rgb0_l.val[RGB_GREEN] = g0.val[0];
    rgb1_l.val[RGB_GREEN] = g1.val[0];
    rgb0_l.val[RGB_BLUE] = b0.val[0];
    rgb1_l.val[RGB_BLUE] = b1.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 14:
      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 13:
      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 12:
      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 11:
      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 10:
      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 9:
      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    case 8:
      vst3_u8(outptr0, rgb0_l);
      vst3_u8(outptr1, rgb1_l);
      break;
    case 7:
      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
      FALLTHROUGH /*FALLTHROUGH*/
    case 6:
      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
      FALLTHROUGH /*FALLTHROUGH*/
    case 5:
      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
      FALLTHROUGH /*FALLTHROUGH*/
    case 4:
      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
      FALLTHROUGH /*FALLTHROUGH*/
    case 3:
      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
      FALLTHROUGH /*FALLTHROUGH*/
    case 2:
      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
      FALLTHROUGH /*FALLTHROUGH*/
    case 1:
      vst3_lane_u8(outptr0, rgb0_l, 0);
      vst3_lane_u8(outptr1, rgb1_l, 0);
      FALLTHROUGH /*FALLTHROUGH*/
    default:
      break;
    }
#endif
  }
}