/*
 * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdmerge-neon.c. */


/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
 * chroma upsampling and YCbCr -> RGB color conversion into a single function.
 *
 * As with the standalone functions, YCbCr -> RGB conversion is defined by the
 * following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdmerge-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */
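
/* As an illustration (not part of the build), the correspondence between the
 * floating-point factor magnitudes and the scaled integer constants above can
 * be checked with a small standalone program.  This is a sketch only; it
 * assumes a hosted C environment and is kept inside #if 0 so that it never
 * affects compilation of this file.
 */
#if 0
#include <stdio.h>

int main(void)
{
  /* Constants scaled by 2^-15 (applied with a 15-bit rounding descale) */
  printf("0.34414 ~= %.7f\n", 11277.0 / 32768.0);  /* |G coefficient for Cb| */
  printf("0.71414 ~= %.7f\n", 23401.0 / 32768.0);  /* |G coefficient for Cr| */
  /* Constants scaled by 2^-14 (applied with vqrdmulhq_lane_s16; see below) */
  printf("1.40200 ~= %.7f\n", 22971.0 / 16384.0);  /* R coefficient for Cr */
  printf("1.77200 ~= %.7f\n", 29033.0 / 16384.0);  /* B coefficient for Cb */
  return 0;
}
#endif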

/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
 * routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */

/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
 */

void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0 = input_buf[0][in_row_group_ctr];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr = output_buf[0];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
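    /* Note: neg_128 (-128 in each int16 lane) is reinterpreted as uint16 so
     * that vaddw_u8() can widen and add the unsigned chroma bytes directly.
     * The addition wraps modulo 2^16, and reinterpreting the result as int16
     * yields exactly Cb - 128 and Cr - 128 in the range [-128, 127], without
     * a separate widen-then-subtract step.
     */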
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
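    /* Note on the two computations below: vqrdmulhq_lane_s16(a, b) computes
     * (2 * a * b + 2^15) >> 16 with rounding and saturation.  The 1.40200 and
     * 1.77200 constants are scaled by 2^14 rather than 2^15, so (Cr - 128)
     * and (Cb - 128) are doubled first; the net effect is a rounded multiply
     * by the unscaled constant.
     */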
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba;
    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr, rgba);
#else
    uint8x16x3_t rgb;
    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr, rgb);
#endif

    /* Increment pointers. */
    inptr0 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr += (RGB_PIXELSIZE * 16);
  }

  if (cols_remaining > 0) {
    /* De-interleave Y component values into two separate vectors, one
     * containing the component values with even-numbered indices and one
     * containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y = vld2_u8(inptr0);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
     * "odd" Y component values.  This effectively upsamples the chroma
     * components horizontally.
     */
    int16x8_t g_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[0]));
    int16x8_t r_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[0]));
    int16x8_t b_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[0]));
    int16x8_t g_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y.val[1]));
    int16x8_t r_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y.val[1]));
    int16x8_t b_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba_h;
    rgba_h.val[RGB_RED] = r.val[1];
    rgba_h.val[RGB_GREEN] = g.val[1];
    rgba_h.val[RGB_BLUE] = b.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    uint8x8x4_t rgba_l;
    rgba_l.val[RGB_RED] = r.val[0];
    rgba_l.val[RGB_GREEN] = g.val[0];
    rgba_l.val[RGB_BLUE] = b.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 14:
      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 13:
      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 12:
      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 11:
      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 10:
      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 9:
      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 8:
      vst4_u8(outptr, rgba_l);
      break;
    case 7:
      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 6:
      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 5:
      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 4:
      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 3:
      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 2:
      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 1:
      vst4_lane_u8(outptr, rgba_l, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }
#else
    uint8x8x3_t rgb_h;
    rgb_h.val[RGB_RED] = r.val[1];
    rgb_h.val[RGB_GREEN] = g.val[1];
    rgb_h.val[RGB_BLUE] = b.val[1];
    uint8x8x3_t rgb_l;
    rgb_l.val[RGB_RED] = r.val[0];
    rgb_l.val[RGB_GREEN] = g.val[0];
    rgb_l.val[RGB_BLUE] = b.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 14:
      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 13:
      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 12:
      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 11:
      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 10:
      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 9:
      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 8:
      vst3_u8(outptr, rgb_l);
      break;
    case 7:
      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 6:
      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 5:
      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 4:
      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 3:
      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 2:
      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 1:
      vst3_lane_u8(outptr, rgb_l, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }
#endif
  }
}
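
/* For reference, the merged h2v1 operation above is equivalent to the
 * following scalar per-pixel loop (floating-point math, clamped to [0, 255]).
 * This is an illustrative sketch only, kept inside #if 0; the names
 * clamp_byte() and h2v1_merged_upsample_ref() are hypothetical and do not
 * exist elsewhere in libjpeg-turbo.
 */
#if 0
static unsigned char clamp_byte(double v)
{
  return (unsigned char)(v < 0.0 ? 0.0 : (v > 255.0 ? 255.0 : v));
}

static void h2v1_merged_upsample_ref(unsigned int output_width,
                                     const unsigned char *y_row,
                                     const unsigned char *cb_row,
                                     const unsigned char *cr_row,
                                     unsigned char *out)  /* RGB, 3 bytes/px */
{
  for (unsigned int col = 0; col < output_width; col++) {
    /* Each chroma sample covers two horizontally adjacent luma samples. */
    double cb = (double)cb_row[col / 2] - 128.0;
    double cr = (double)cr_row[col / 2] - 128.0;
    double y = (double)y_row[col];

    out[3 * col + 0] = clamp_byte(y + 1.40200 * cr);                 /* R */
    out[3 * col + 1] = clamp_byte(y - 0.34414 * cb - 0.71414 * cr);  /* G */
    out[3 * col + 2] = clamp_byte(y + 1.77200 * cb);                 /* B */
  }
}
#endif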


/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
 *
 * See comments above for details regarding color conversion and safe memory
 * access.
 */

void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf)
{
  JSAMPROW outptr0, outptr1;
  /* Pointers to Y (both rows), Cb, and Cr data */
  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
  inptr1 = input_buf[1][in_row_group_ctr];
  inptr2 = input_buf[2][in_row_group_ctr];
  outptr0 = output_buf[0];
  outptr1 = output_buf[1];

  int cols_remaining = output_width;
  for (; cols_remaining >= 16; cols_remaining -= 16) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples the
     * chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x16x4_t rgba0, rgba1;
    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Set alpha channel to opaque (0xFF). */
    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    vst4q_u8(outptr0, rgba0);
    vst4q_u8(outptr1, rgba1);
#else
    uint8x16x3_t rgb0, rgb1;
    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
    /* Store RGB pixel data to memory. */
    vst3q_u8(outptr0, rgb0);
    vst3q_u8(outptr1, rgb1);
#endif

    /* Increment pointers. */
    inptr0_0 += 16;
    inptr0_1 += 16;
    inptr1 += 8;
    inptr2 += 8;
    outptr0 += (RGB_PIXELSIZE * 16);
    outptr1 += (RGB_PIXELSIZE * 16);
  }

  if (cols_remaining > 0) {
    /* For each row, de-interleave Y component values into two separate
     * vectors, one containing the component values with even-numbered indices
     * and one containing the component values with odd-numbered indices.
     */
    uint8x8x2_t y0 = vld2_u8(inptr0_0);
    uint8x8x2_t y1 = vld2_u8(inptr0_1);
    uint8x8_t cb = vld1_u8(inptr1);
    uint8x8_t cr = vld1_u8(inptr2);
    /* Subtract 128 from Cb and Cr. */
    int16x8_t cr_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
    int16x8_t cb_128 =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                     vrshrn_n_s32(g_sub_y_h, 15));
    /* Compute R-Y: 1.40200 * (Cr - 128) */
    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
    /* Compute B-Y: 1.77200 * (Cb - 128) */
    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
     * the "even" and "odd" Y component values.  This effectively upsamples the
     * chroma components both horizontally and vertically.
     */
    int16x8_t g0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[0]));
    int16x8_t r0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[0]));
    int16x8_t b0_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[0]));
    int16x8_t g0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y0.val[1]));
    int16x8_t r0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y0.val[1]));
    int16x8_t b0_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y0.val[1]));
    int16x8_t g1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[0]));
    int16x8_t r1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[0]));
    int16x8_t b1_even =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[0]));
    int16x8_t g1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
                                     y1.val[1]));
    int16x8_t r1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
                                     y1.val[1]));
    int16x8_t b1_odd =
      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
                                     y1.val[1]));
    /* Convert each component to unsigned and narrow, clamping to [0-255].
     * Re-interleave the "even" and "odd" component values.
     */
    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));

#ifdef RGB_ALPHA
    uint8x8x4_t rgba0_h, rgba1_h;
    rgba0_h.val[RGB_RED] = r0.val[1];
    rgba1_h.val[RGB_RED] = r1.val[1];
    rgba0_h.val[RGB_GREEN] = g0.val[1];
    rgba1_h.val[RGB_GREEN] = g1.val[1];
    rgba0_h.val[RGB_BLUE] = b0.val[1];
    rgba1_h.val[RGB_BLUE] = b1.val[1];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);

    uint8x8x4_t rgba0_l, rgba1_l;
    rgba0_l.val[RGB_RED] = r0.val[0];
    rgba1_l.val[RGB_RED] = r1.val[0];
    rgba0_l.val[RGB_GREEN] = g0.val[0];
    rgba1_l.val[RGB_GREEN] = g1.val[0];
    rgba0_l.val[RGB_BLUE] = b0.val[0];
    rgba1_l.val[RGB_BLUE] = b1.val[0];
    /* Set alpha channel to opaque (0xFF). */
    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
    /* Store RGBA pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 14:
      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 13:
      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 12:
      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 11:
      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 10:
      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 9:
      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 8:
      vst4_u8(outptr0, rgba0_l);
      vst4_u8(outptr1, rgba1_l);
      break;
    case 7:
      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 6:
      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 5:
      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 4:
      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 3:
      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 2:
      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 1:
      vst4_lane_u8(outptr0, rgba0_l, 0);
      vst4_lane_u8(outptr1, rgba1_l, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }
#else
    uint8x8x3_t rgb0_h, rgb1_h;
    rgb0_h.val[RGB_RED] = r0.val[1];
    rgb1_h.val[RGB_RED] = r1.val[1];
    rgb0_h.val[RGB_GREEN] = g0.val[1];
    rgb1_h.val[RGB_GREEN] = g1.val[1];
    rgb0_h.val[RGB_BLUE] = b0.val[1];
    rgb1_h.val[RGB_BLUE] = b1.val[1];

    uint8x8x3_t rgb0_l, rgb1_l;
    rgb0_l.val[RGB_RED] = r0.val[0];
    rgb1_l.val[RGB_RED] = r1.val[0];
    rgb0_l.val[RGB_GREEN] = g0.val[0];
    rgb1_l.val[RGB_GREEN] = g1.val[0];
    rgb0_l.val[RGB_BLUE] = b0.val[0];
    rgb1_l.val[RGB_BLUE] = b1.val[0];
    /* Store RGB pixel data to memory. */
    switch (cols_remaining) {
    case 15:
      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 14:
      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 13:
      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 12:
      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 11:
      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 10:
      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 9:
      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 8:
      vst3_u8(outptr0, rgb0_l);
      vst3_u8(outptr1, rgb1_l);
      break;
    case 7:
      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 6:
      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 5:
      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 4:
      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 3:
      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 2:
      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 1:
      vst3_lane_u8(outptr0, rgb0_l, 0);
      vst3_lane_u8(outptr1, rgb1_l, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }
#endif
  }
}