• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 
21 // Read 8 Y, 4 U and 4 V from 422
// Operands %0, %1 and %2 are the Y, U and V pointers; each is post-incremented
// by the number of bytes read (8, 4 and 4).
// Result: Y in v0.8b, U in v1.s[0] (bytes 0-3), V in v1.s[1] (bytes 4-7).
22 #define READYUV422                                                             \
23     MEMACCESS(0)                                                               \
24     "ld1        {v0.8b}, [%0], #8              \n"                             \
25     MEMACCESS(1)                                                               \
26     "ld1        {v1.s}[0], [%1], #4            \n"                             \
27     MEMACCESS(2)                                                               \
28     "ld1        {v1.s}[1], [%2], #4            \n"
29 
30 // Read 8 Y, 2 U and 2 V from 411
// Operands %0, %1 and %2 are the Y, U and V pointers, post-incremented by
// 8, 2 and 2 bytes.  zip1 interleaves v2 with itself, so every chroma byte is
// doubled: v1.8b ends up as 4 U (bytes 0-3) followed by 4 V (bytes 4-7).
// Result: Y in v0.8b, chroma in v1.8b.  Overwrites v2.
31 #define READYUV411                                                             \
32     MEMACCESS(0)                                                               \
33     "ld1        {v0.8b}, [%0], #8              \n"                             \
34     MEMACCESS(1)                                                               \
35     "ld1        {v2.h}[0], [%1], #2            \n"                             \
36     MEMACCESS(2)                                                               \
37     "ld1        {v2.h}[1], [%2], #2            \n"                             \
38     "zip1       v1.8b, v2.8b, v2.8b            \n"
39 
40 // Read 8 Y, 8 U and 8 V from 444
// Operands %0, %1 and %2 are the Y, U and V pointers, each post-incremented
// by 8 bytes.  uaddlp sums adjacent chroma bytes and rshrn #1 halves the sums
// with rounding, so each horizontal pair of U (and of V) is averaged.
// Result: Y in v0.8b, 4 averaged U in v1 bytes 0-3, 4 averaged V in bytes 4-7.
41 #define READYUV444                                                             \
42     MEMACCESS(0)                                                               \
43     "ld1        {v0.8b}, [%0], #8              \n"                             \
44     MEMACCESS(1)                                                               \
45     "ld1        {v1.d}[0], [%1], #8            \n"                             \
46     MEMACCESS(2)                                                               \
47     "ld1        {v1.d}[1], [%2], #8            \n"                             \
48     "uaddlp     v1.8h, v1.16b                  \n"                             \
49     "rshrn      v1.8b, v1.8h, #1               \n"
50 
51 // Read 8 Y, and set 4 U and 4 V to 128
// Operand %0 is the Y pointer, post-incremented by 8 bytes.
// Result: Y in v0.8b; every byte of v1.8b is 128.
52 #define READYUV400                                                             \
53     MEMACCESS(0)                                                               \
54     "ld1        {v0.8b}, [%0], #8              \n"                             \
55     "movi       v1.8b , #128                   \n"
56 
57 // Read 8 Y and 4 UV from NV12
// Operand %0 is the Y pointer and %1 the interleaved UV pointer; both are
// post-incremented by 8 bytes.  uzp1 keeps the even bytes (U) and uzp2 the odd
// bytes (V); ins then places the 4 V bytes in the upper half of v1.8b.
// Result: Y in v0.8b, U in v1 bytes 0-3, V in v1 bytes 4-7.  Overwrites v2, v3.
58 #define READNV12                                                               \
59     MEMACCESS(0)                                                               \
60     "ld1        {v0.8b}, [%0], #8              \n"                             \
61     MEMACCESS(1)                                                               \
62     "ld1        {v2.8b}, [%1], #8              \n"                             \
63     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
64     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
65     "ins        v1.s[1], v3.s[0]               \n"
66 
67 // Read 8 Y and 4 VU from NV21
// Operand %0 is the Y pointer and %1 the interleaved VU pointer; both are
// post-incremented by 8 bytes.  Here the even bytes (uzp1 -> v3) are V and the
// odd bytes (uzp2 -> v1) are U; ins moves the 4 V bytes into the upper half.
// Result: Y in v0.8b, U in v1 bytes 0-3, V in v1 bytes 4-7.  Overwrites v2, v3.
68 #define READNV21                                                               \
69     MEMACCESS(0)                                                               \
70     "ld1        {v0.8b}, [%0], #8              \n"                             \
71     MEMACCESS(1)                                                               \
72     "ld1        {v2.8b}, [%1], #8              \n"                             \
73     "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
74     "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
75     "ins        v1.s[1], v3.s[0]               \n"
76 
77 // Read 8 YUY2
// Operand %0 is the packed source pointer, post-incremented by 16 bytes.
// ld2 de-interleaves: even bytes (Y) go to v0.8b, odd bytes (U,V,U,V,...) to
// v1.8b.  uzp2/uzp1 then split the chroma into V (v3) and U (v1), and ins puts
// the 4 V bytes in the upper half of v1.8b.
// Result: Y in v0.8b, U in v1 bytes 0-3, V in v1 bytes 4-7.  Overwrites v3.
78 #define READYUY2                                                               \
79     MEMACCESS(0)                                                               \
80     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
81     "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
82     "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
83     "ins        v1.s[1], v3.s[0]               \n"
84 
85 // Read 8 UYVY
// Operand %0 is the packed source pointer, post-incremented by 16 bytes.
// ld2 de-interleaves: even bytes (U,V,U,V,...) go to v2.8b, odd bytes (Y) to
// v3.8b.  orr copies Y into v0 before v3 is reused for the V bytes.
// Result: Y in v0.8b, U in v1 bytes 0-3, V in v1 bytes 4-7.  Overwrites v2, v3.
86 #define READUYVY                                                               \
87     MEMACCESS(0)                                                               \
88     "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
89     "orr        v0.8b, v3.8b, v3.8b            \n"                             \
90     "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
91     "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
92     "ins        v1.s[1], v3.s[0]               \n"
93 
// Load the conversion constants used by YUVTORGB.  Requires the named asm
// operands kUVBiasBGR, kYToRgb, kUVToRB and kUVToG, each holding an address.
//   v24, v25, v26: the first three 16-bit values at kUVBiasBGR, each broadcast
//                  to all 8 lanes (added to B, G and R respectively).
//   v31:           the first 32-bit value at kYToRgb, broadcast to 4 lanes.
//   v27 / v28:     even / odd 16-bit elements of the 16 values at kUVToRB.
//   v29 / v30:     even / odd 16-bit elements of the 16 values at kUVToG.
// NOTE(review): the first two loads post-increment the kUVBiasBGR register, so
// the operand is written by the asm; v31 also needs to be in the clobber list
// of any asm statement that uses this macro.
94 #define YUVTORGB_SETUP                                                         \
95     "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
96     "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
97     "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
98     "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
99     "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
100     "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
101 
// Convert 8 pixels from YUV to 8-bit R, G and B.
// Input:  8 Y in v0.8b; 4 U in v1 bytes 0-3 and 4 V in v1 bytes 4-7; the
//         constants loaded by YUVTORGB_SETUP in v24-v31.
// Output: 8 bytes each in the registers named by vR, vG and vB.
// Steps:  Y is widened, multiplied by v31 and narrowed with a saturating
//         shift right by 16.  shll #8 followed by uaddw puts every chroma byte
//         in both halves of a 16-bit lane, i.e. duplicates it for two adjacent
//         pixels.  Then
//           B = v24 + Y + U * v27
//           G = v25 + Y - (U * v29 + V * v30)
//           R = v26 + Y + V * v28
//         using saturating adds/subtracts, and each sum is narrowed to unsigned
//         8 bits with a saturating shift right by 6.
// Overwrites v0-v3 and v5-v7 in addition to the three output registers.
102 #define YUVTORGB(vR, vG, vB)                                                   \
103     "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
104     "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
105     "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
106     "ushll      v0.4s, v0.4h, #0               \n"                             \
107     "mul        v3.4s, v3.4s, v31.4s           \n"                             \
108     "mul        v0.4s, v0.4s, v31.4s           \n"                             \
109     "sqshrun    v0.4h, v0.4s, #16              \n"                             \
110     "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
111     "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
112     "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
113     "uxtl       v2.8h, v2.8b                   \n"                             \
114     "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
115     "mul        v3.8h, v1.8h, v27.8h           \n"                             \
116     "mul        v5.8h, v1.8h, v29.8h           \n"                             \
117     "mul        v6.8h, v2.8h, v30.8h           \n"                             \
118     "mul        v7.8h, v2.8h, v28.8h           \n"                             \
119     "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
120     "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
121     "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
122     "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
123     "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
124     "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
125     "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
126     "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
127     "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
128     "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
129 
I444ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)130 void I444ToARGBRow_NEON(const uint8* src_y,
131                         const uint8* src_u,
132                         const uint8* src_v,
133                         uint8* dst_argb,
134                         const struct YuvConstants* yuvconstants,
135                         int width) {
136   asm volatile (
137     YUVTORGB_SETUP
138     "movi       v23.8b, #255                   \n" /* A */
139   "1:                                          \n"
140     READYUV444
141     YUVTORGB(v22, v21, v20)
142     "subs       %w4, %w4, #8                   \n"
143     MEMACCESS(3)
144     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
145     "b.gt       1b                             \n"
146     : "+r"(src_y),     // %0
147       "+r"(src_u),     // %1
148       "+r"(src_v),     // %2
149       "+r"(dst_argb),  // %3
150       "+r"(width)      // %4
151     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
152       [kUVToG]"r"(&yuvconstants->kUVToG),
153       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
154       [kYToRgb]"r"(&yuvconstants->kYToRgb)
155     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
156       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
157   );
158 }
159 
I422ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)160 void I422ToARGBRow_NEON(const uint8* src_y,
161                         const uint8* src_u,
162                         const uint8* src_v,
163                         uint8* dst_argb,
164                         const struct YuvConstants* yuvconstants,
165                         int width) {
166   asm volatile (
167     YUVTORGB_SETUP
168     "movi       v23.8b, #255                   \n" /* A */
169   "1:                                          \n"
170     READYUV422
171     YUVTORGB(v22, v21, v20)
172     "subs       %w4, %w4, #8                   \n"
173     MEMACCESS(3)
174     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
175     "b.gt       1b                             \n"
176     : "+r"(src_y),     // %0
177       "+r"(src_u),     // %1
178       "+r"(src_v),     // %2
179       "+r"(dst_argb),  // %3
180       "+r"(width)      // %4
181     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
182       [kUVToG]"r"(&yuvconstants->kUVToG),
183       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
184       [kYToRgb]"r"(&yuvconstants->kYToRgb)
185     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
186       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
187   );
188 }
189 
I422AlphaToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,const uint8 * src_a,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)190 void I422AlphaToARGBRow_NEON(const uint8* src_y,
191                              const uint8* src_u,
192                              const uint8* src_v,
193                              const uint8* src_a,
194                              uint8* dst_argb,
195                              const struct YuvConstants* yuvconstants,
196                              int width) {
197   asm volatile (
198     YUVTORGB_SETUP
199   "1:                                          \n"
200     READYUV422
201     YUVTORGB(v22, v21, v20)
202     MEMACCESS(3)
203     "ld1        {v23.8b}, [%3], #8             \n"
204     "subs       %w5, %w5, #8                   \n"
205     MEMACCESS(4)
206     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
207     "b.gt       1b                             \n"
208     : "+r"(src_y),     // %0
209       "+r"(src_u),     // %1
210       "+r"(src_v),     // %2
211       "+r"(src_a),     // %3
212       "+r"(dst_argb),  // %4
213       "+r"(width)      // %5
214     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
215       [kUVToG]"r"(&yuvconstants->kUVToG),
216       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
217       [kYToRgb]"r"(&yuvconstants->kYToRgb)
218     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
219       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
220   );
221 }
222 
I411ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)223 void I411ToARGBRow_NEON(const uint8* src_y,
224                         const uint8* src_u,
225                         const uint8* src_v,
226                         uint8* dst_argb,
227                         const struct YuvConstants* yuvconstants,
228                         int width) {
229   asm volatile (
230     YUVTORGB_SETUP
231     "movi       v23.8b, #255                   \n" /* A */
232   "1:                                          \n"
233     READYUV411
234     YUVTORGB(v22, v21, v20)
235     "subs       %w4, %w4, #8                   \n"
236     MEMACCESS(3)
237     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
238     "b.gt       1b                             \n"
239     : "+r"(src_y),     // %0
240       "+r"(src_u),     // %1
241       "+r"(src_v),     // %2
242       "+r"(dst_argb),  // %3
243       "+r"(width)      // %4
244     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
245       [kUVToG]"r"(&yuvconstants->kUVToG),
246       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
247       [kYToRgb]"r"(&yuvconstants->kYToRgb)
248     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
249       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
250   );
251 }
252 
I422ToRGBARow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgba,const struct YuvConstants * yuvconstants,int width)253 void I422ToRGBARow_NEON(const uint8* src_y,
254                         const uint8* src_u,
255                         const uint8* src_v,
256                         uint8* dst_rgba,
257                         const struct YuvConstants* yuvconstants,
258                         int width) {
259   asm volatile (
260     YUVTORGB_SETUP
261     "movi       v20.8b, #255                   \n" /* A */
262   "1:                                          \n"
263     READYUV422
264     YUVTORGB(v23, v22, v21)
265     "subs       %w4, %w4, #8                   \n"
266     MEMACCESS(3)
267     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
268     "b.gt       1b                             \n"
269     : "+r"(src_y),     // %0
270       "+r"(src_u),     // %1
271       "+r"(src_v),     // %2
272       "+r"(dst_rgba),  // %3
273       "+r"(width)      // %4
274     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
275       [kUVToG]"r"(&yuvconstants->kUVToG),
276       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
277       [kYToRgb]"r"(&yuvconstants->kYToRgb)
278     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
279       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
280   );
281 }
282 
I422ToRGB24Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgb24,const struct YuvConstants * yuvconstants,int width)283 void I422ToRGB24Row_NEON(const uint8* src_y,
284                          const uint8* src_u,
285                          const uint8* src_v,
286                          uint8* dst_rgb24,
287                          const struct YuvConstants* yuvconstants,
288                          int width) {
289   asm volatile (
290     YUVTORGB_SETUP
291   "1:                                          \n"
292     READYUV422
293     YUVTORGB(v22, v21, v20)
294     "subs       %w4, %w4, #8                   \n"
295     MEMACCESS(3)
296     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
297     "b.gt       1b                             \n"
298     : "+r"(src_y),     // %0
299       "+r"(src_u),     // %1
300       "+r"(src_v),     // %2
301       "+r"(dst_rgb24), // %3
302       "+r"(width)      // %4
303     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
304       [kUVToG]"r"(&yuvconstants->kUVToG),
305       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
306       [kYToRgb]"r"(&yuvconstants->kYToRgb)
307     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
308       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
309   );
310 }
311 
// Pack 8 pixels into 16-bit RGB565.
// Input:  B in v20.8b, G in v21.8b, R in v22.8b.
// Output: v0.8h, each lane holding R in bits 15-11, G in bits 10-5 and B in
//         bits 4-0.  shll #8 moves each byte to the top of a 16-bit lane; sri
//         then shifts G and B right into place while keeping the bits of v0
//         above the insertion point.
// Overwrites v20 and v21 with their widened values.
312 #define ARGBTORGB565                                                           \
313     "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
314     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
315     "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
316     "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
317     "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
318 
I422ToRGB565Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgb565,const struct YuvConstants * yuvconstants,int width)319 void I422ToRGB565Row_NEON(const uint8* src_y,
320                           const uint8* src_u,
321                           const uint8* src_v,
322                           uint8* dst_rgb565,
323                           const struct YuvConstants* yuvconstants,
324                           int width) {
325   asm volatile (
326     YUVTORGB_SETUP
327   "1:                                          \n"
328     READYUV422
329     YUVTORGB(v22, v21, v20)
330     "subs       %w4, %w4, #8                   \n"
331     ARGBTORGB565
332     MEMACCESS(3)
333     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
334     "b.gt       1b                             \n"
335     : "+r"(src_y),    // %0
336       "+r"(src_u),    // %1
337       "+r"(src_v),    // %2
338       "+r"(dst_rgb565),  // %3
339       "+r"(width)     // %4
340     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
341       [kUVToG]"r"(&yuvconstants->kUVToG),
342       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
343       [kYToRgb]"r"(&yuvconstants->kYToRgb)
344     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
345       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
346   );
347 }
348 
// Pack 8 pixels into 16-bit ARGB1555.
// Input:  B in v20.8b, G in v21.8b, R in v22.8b, A in v23.8b.
// Output: v0.8h, each lane holding A in bit 15, R in bits 14-10, G in bits 9-5
//         and B in bits 4-0.  shll #8 moves each byte to the top of a 16-bit
//         lane; each sri shifts the next channel right into place while
//         keeping the bits of v0 above the insertion point.
// Overwrites v20, v21 and v22 with their widened values; v23 is only read.
349 #define ARGBTOARGB1555                                                         \
350     "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
351     "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
352     "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
353     "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
354     "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
355     "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
356     "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
357 
I422ToARGB1555Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb1555,const struct YuvConstants * yuvconstants,int width)358 void I422ToARGB1555Row_NEON(const uint8* src_y,
359                             const uint8* src_u,
360                             const uint8* src_v,
361                             uint8* dst_argb1555,
362                             const struct YuvConstants* yuvconstants,
363                             int width) {
364   asm volatile (
365     YUVTORGB_SETUP
366     "movi       v23.8b, #255                   \n"
367   "1:                                          \n"
368     READYUV422
369     YUVTORGB(v22, v21, v20)
370     "subs       %w4, %w4, #8                   \n"
371     ARGBTOARGB1555
372     MEMACCESS(3)
373     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
374     "b.gt       1b                             \n"
375     : "+r"(src_y),    // %0
376       "+r"(src_u),    // %1
377       "+r"(src_v),    // %2
378       "+r"(dst_argb1555),  // %3
379       "+r"(width)     // %4
380     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
381       [kUVToG]"r"(&yuvconstants->kUVToG),
382       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
383       [kYToRgb]"r"(&yuvconstants->kYToRgb)
384     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
385       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
386   );
387 }
388 
// Pack 8 pixels into 16-bit ARGB4444.
// B and R are shifted right by 4 so their top nibble lands in the low nibble;
// G and A have their low nibble cleared with the 0x0f mask in v4.  The two
// orr instructions form the bytes (G:B) and (A:R), and zip1 interleaves them
// so v0 holds 8 two-byte pixels.
// Overwrites v1 and v20-v23.
389 #define ARGBTOARGB4444                                                         \
390     /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
391     "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
392     "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
393     "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
394     "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
395     "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
396     "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
397     "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
398 
I422ToARGB4444Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb4444,const struct YuvConstants * yuvconstants,int width)399 void I422ToARGB4444Row_NEON(const uint8* src_y,
400                             const uint8* src_u,
401                             const uint8* src_v,
402                             uint8* dst_argb4444,
403                             const struct YuvConstants* yuvconstants,
404                             int width) {
405   asm volatile (
406     YUVTORGB_SETUP
407     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
408   "1:                                          \n"
409     READYUV422
410     YUVTORGB(v22, v21, v20)
411     "subs       %w4, %w4, #8                   \n"
412     "movi       v23.8b, #255                   \n"
413     ARGBTOARGB4444
414     MEMACCESS(3)
415     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
416     "b.gt       1b                             \n"
417     : "+r"(src_y),    // %0
418       "+r"(src_u),    // %1
419       "+r"(src_v),    // %2
420       "+r"(dst_argb4444),  // %3
421       "+r"(width)     // %4
422     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
423       [kUVToG]"r"(&yuvconstants->kUVToG),
424       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
425       [kYToRgb]"r"(&yuvconstants->kYToRgb)
426     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
427       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
428   );
429 }
430 
I400ToARGBRow_NEON(const uint8 * src_y,uint8 * dst_argb,int width)431 void I400ToARGBRow_NEON(const uint8* src_y,
432                         uint8* dst_argb,
433                         int width) {
434   asm volatile (
435     YUVTORGB_SETUP
436     "movi       v23.8b, #255                   \n"
437   "1:                                          \n"
438     READYUV400
439     YUVTORGB(v22, v21, v20)
440     "subs       %w2, %w2, #8                   \n"
441     MEMACCESS(1)
442     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
443     "b.gt       1b                             \n"
444     : "+r"(src_y),     // %0
445       "+r"(dst_argb),  // %1
446       "+r"(width)      // %2
447     : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
448       [kUVToG]"r"(&kYuvI601Constants.kUVToG),
449       [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
450       [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
451     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
452       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
453   );
454 }
455 
J400ToARGBRow_NEON(const uint8 * src_y,uint8 * dst_argb,int width)456 void J400ToARGBRow_NEON(const uint8* src_y,
457                         uint8* dst_argb,
458                         int width) {
459   asm volatile (
460     "movi       v23.8b, #255                   \n"
461   "1:                                          \n"
462     MEMACCESS(0)
463     "ld1        {v20.8b}, [%0], #8             \n"
464     "orr        v21.8b, v20.8b, v20.8b         \n"
465     "orr        v22.8b, v20.8b, v20.8b         \n"
466     "subs       %w2, %w2, #8                   \n"
467     MEMACCESS(1)
468     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
469     "b.gt       1b                             \n"
470     : "+r"(src_y),     // %0
471       "+r"(dst_argb),  // %1
472       "+r"(width)      // %2
473     :
474     : "cc", "memory", "v20", "v21", "v22", "v23"
475   );
476 }
477 
NV12ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_uv,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)478 void NV12ToARGBRow_NEON(const uint8* src_y,
479                         const uint8* src_uv,
480                         uint8* dst_argb,
481                         const struct YuvConstants* yuvconstants,
482                         int width) {
483   asm volatile (
484     YUVTORGB_SETUP
485     "movi       v23.8b, #255                   \n"
486   "1:                                          \n"
487     READNV12
488     YUVTORGB(v22, v21, v20)
489     "subs       %w3, %w3, #8                   \n"
490     MEMACCESS(2)
491     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
492     "b.gt       1b                             \n"
493     : "+r"(src_y),     // %0
494       "+r"(src_uv),    // %1
495       "+r"(dst_argb),  // %2
496       "+r"(width)      // %3
497     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
498       [kUVToG]"r"(&yuvconstants->kUVToG),
499       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
500       [kYToRgb]"r"(&yuvconstants->kYToRgb)
501     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
502       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
503   );
504 }
505 
NV21ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_vu,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)506 void NV21ToARGBRow_NEON(const uint8* src_y,
507                         const uint8* src_vu,
508                         uint8* dst_argb,
509                         const struct YuvConstants* yuvconstants,
510                         int width) {
511   asm volatile (
512     YUVTORGB_SETUP
513     "movi       v23.8b, #255                   \n"
514   "1:                                          \n"
515     READNV21
516     YUVTORGB(v22, v21, v20)
517     "subs       %w3, %w3, #8                   \n"
518     MEMACCESS(2)
519     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
520     "b.gt       1b                             \n"
521     : "+r"(src_y),     // %0
522       "+r"(src_vu),    // %1
523       "+r"(dst_argb),  // %2
524       "+r"(width)      // %3
525     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
526       [kUVToG]"r"(&yuvconstants->kUVToG),
527       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
528       [kYToRgb]"r"(&yuvconstants->kYToRgb)
529     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
530       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
531   );
532 }
533 
NV12ToRGB565Row_NEON(const uint8 * src_y,const uint8 * src_uv,uint8 * dst_rgb565,const struct YuvConstants * yuvconstants,int width)534 void NV12ToRGB565Row_NEON(const uint8* src_y,
535                           const uint8* src_uv,
536                           uint8* dst_rgb565,
537                           const struct YuvConstants* yuvconstants,
538                           int width) {
539   asm volatile (
540     YUVTORGB_SETUP
541   "1:                                          \n"
542     READNV12
543     YUVTORGB(v22, v21, v20)
544     "subs       %w3, %w3, #8                   \n"
545     ARGBTORGB565
546     MEMACCESS(2)
547     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
548     "b.gt       1b                             \n"
549     : "+r"(src_y),     // %0
550       "+r"(src_uv),    // %1
551       "+r"(dst_rgb565),  // %2
552       "+r"(width)      // %3
553     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
554       [kUVToG]"r"(&yuvconstants->kUVToG),
555       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
556       [kYToRgb]"r"(&yuvconstants->kYToRgb)
557     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
558       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
559   );
560 }
561 
YUY2ToARGBRow_NEON(const uint8 * src_yuy2,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)562 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
563                         uint8* dst_argb,
564                         const struct YuvConstants* yuvconstants,
565                         int width) {
566   asm volatile (
567     YUVTORGB_SETUP
568     "movi       v23.8b, #255                   \n"
569   "1:                                          \n"
570     READYUY2
571     YUVTORGB(v22, v21, v20)
572     "subs       %w2, %w2, #8                   \n"
573     MEMACCESS(1)
574     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
575     "b.gt       1b                             \n"
576     : "+r"(src_yuy2),  // %0
577       "+r"(dst_argb),  // %1
578       "+r"(width)      // %2
579     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
580       [kUVToG]"r"(&yuvconstants->kUVToG),
581       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
582       [kYToRgb]"r"(&yuvconstants->kYToRgb)
583     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
584       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
585   );
586 }
587 
UYVYToARGBRow_NEON(const uint8 * src_uyvy,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)588 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
589                         uint8* dst_argb,
590                         const struct YuvConstants* yuvconstants,
591                         int width) {
592   asm volatile (
593     YUVTORGB_SETUP
594     "movi       v23.8b, #255                   \n"
595   "1:                                          \n"
596     READUYVY
597     YUVTORGB(v22, v21, v20)
598     "subs       %w2, %w2, #8                   \n"
599     MEMACCESS(1)
600     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
601     "b.gt       1b                             \n"
602     : "+r"(src_uyvy),  // %0
603       "+r"(dst_argb),  // %1
604       "+r"(width)      // %2
605     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
606       [kUVToG]"r"(&yuvconstants->kUVToG),
607       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
608       [kYToRgb]"r"(&yuvconstants->kYToRgb)
609     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
610       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
611   );
612 }
613 
614 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
SplitUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)615 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
616                      int width) {
617   asm volatile (
618   "1:                                          \n"
619     MEMACCESS(0)
620     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
621     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
622     MEMACCESS(1)
623     "st1        {v0.16b}, [%1], #16            \n"  // store U
624     MEMACCESS(2)
625     "st1        {v1.16b}, [%2], #16            \n"  // store V
626     "b.gt       1b                             \n"
627     : "+r"(src_uv),  // %0
628       "+r"(dst_u),   // %1
629       "+r"(dst_v),   // %2
630       "+r"(width)    // %3  // Output registers
631     :                       // Input registers
632     : "cc", "memory", "v0", "v1"  // Clobber List
633   );
634 }
635 
636 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)637 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
638                      int width) {
639   asm volatile (
640   "1:                                          \n"
641     MEMACCESS(0)
642     "ld1        {v0.16b}, [%0], #16            \n"  // load U
643     MEMACCESS(1)
644     "ld1        {v1.16b}, [%1], #16            \n"  // load V
645     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
646     MEMACCESS(2)
647     "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
648     "b.gt       1b                             \n"
649     :
650       "+r"(src_u),   // %0
651       "+r"(src_v),   // %1
652       "+r"(dst_uv),  // %2
653       "+r"(width)    // %3  // Output registers
654     :                       // Input registers
655     : "cc", "memory", "v0", "v1"  // Clobber List
656   );
657 }
658 
659 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
CopyRow_NEON(const uint8 * src,uint8 * dst,int count)660 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
661   asm volatile (
662   "1:                                          \n"
663     MEMACCESS(0)
664     "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
665     "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
666     MEMACCESS(1)
667     "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
668     "b.gt       1b                             \n"
669   : "+r"(src),   // %0
670     "+r"(dst),   // %1
671     "+r"(count)  // %2  // Output registers
672   :                     // Input registers
673   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
674   );
675 }
676 
677 // SetRow writes 'count' bytes using an 8 bit value repeated.
SetRow_NEON(uint8 * dst,uint8 v8,int count)678 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
679   asm volatile (
680     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
681   "1:                                          \n"
682     "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
683     MEMACCESS(0)
684     "st1        {v0.16b}, [%0], #16            \n"  // store
685     "b.gt       1b                             \n"
686   : "+r"(dst),   // %0
687     "+r"(count)  // %1
688   : "r"(v8)      // %2
689   : "cc", "memory", "v0"
690   );
691 }
692 
ARGBSetRow_NEON(uint8 * dst,uint32 v32,int count)693 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
694   asm volatile (
695     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
696   "1:                                          \n"
697     "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
698     MEMACCESS(0)
699     "st1        {v0.16b}, [%0], #16            \n"  // store
700     "b.gt       1b                             \n"
701   : "+r"(dst),   // %0
702     "+r"(count)  // %1
703   : "r"(v32)     // %2
704   : "cc", "memory", "v0"
705   );
706 }
707 
MirrorRow_NEON(const uint8 * src,uint8 * dst,int width)708 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
709   asm volatile (
710     // Start at end of source row.
711     "add        %0, %0, %w2, sxtw              \n"
712     "sub        %0, %0, #16                    \n"
713   "1:                                          \n"
714     MEMACCESS(0)
715     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
716     "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
717     "rev64      v0.16b, v0.16b                 \n"
718     MEMACCESS(1)
719     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
720     MEMACCESS(1)
721     "st1        {v0.D}[0], [%1], #8            \n"
722     "b.gt       1b                             \n"
723   : "+r"(src),   // %0
724     "+r"(dst),   // %1
725     "+r"(width)  // %2
726   : "r"((ptrdiff_t)-16)    // %3
727   : "cc", "memory", "v0"
728   );
729 }
730 
MirrorUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)731 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
732                       int width) {
733   asm volatile (
734     // Start at end of source row.
735     "add        %0, %0, %w3, sxtw #1           \n"
736     "sub        %0, %0, #16                    \n"
737   "1:                                          \n"
738     MEMACCESS(0)
739     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
740     "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
741     "rev64      v0.8b, v0.8b                   \n"
742     "rev64      v1.8b, v1.8b                   \n"
743     MEMACCESS(1)
744     "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
745     MEMACCESS(2)
746     "st1        {v1.8b}, [%2], #8              \n"
747     "b.gt       1b                             \n"
748   : "+r"(src_uv),  // %0
749     "+r"(dst_u),   // %1
750     "+r"(dst_v),   // %2
751     "+r"(width)    // %3
752   : "r"((ptrdiff_t)-16)      // %4
753   : "cc", "memory", "v0", "v1"
754   );
755 }
756 
ARGBMirrorRow_NEON(const uint8 * src,uint8 * dst,int width)757 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
758   asm volatile (
759   // Start at end of source row.
760     "add        %0, %0, %w2, sxtw #2           \n"
761     "sub        %0, %0, #16                    \n"
762   "1:                                          \n"
763     MEMACCESS(0)
764     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
765     "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
766     "rev64      v0.4s, v0.4s                   \n"
767     MEMACCESS(1)
768     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
769     MEMACCESS(1)
770     "st1        {v0.D}[0], [%1], #8            \n"
771     "b.gt       1b                             \n"
772   : "+r"(src),   // %0
773     "+r"(dst),   // %1
774     "+r"(width)  // %2
775   : "r"((ptrdiff_t)-16)    // %3
776   : "cc", "memory", "v0"
777   );
778 }
779 
RGB24ToARGBRow_NEON(const uint8 * src_rgb24,uint8 * dst_argb,int width)780 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
781   asm volatile (
782     "movi       v4.8b, #255                    \n"  // Alpha
783   "1:                                          \n"
784     MEMACCESS(0)
785     "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
786     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
787     MEMACCESS(1)
788     "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
789     "b.gt       1b                             \n"
790   : "+r"(src_rgb24),  // %0
791     "+r"(dst_argb),   // %1
792     "+r"(width)       // %2
793   :
794   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
795   );
796 }
797 
RAWToARGBRow_NEON(const uint8 * src_raw,uint8 * dst_argb,int width)798 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
799   asm volatile (
800     "movi       v5.8b, #255                    \n"  // Alpha
801   "1:                                          \n"
802     MEMACCESS(0)
803     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
804     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
805     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
806     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
807     MEMACCESS(1)
808     "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
809     "b.gt       1b                             \n"
810   : "+r"(src_raw),   // %0
811     "+r"(dst_argb),  // %1
812     "+r"(width)      // %2
813   :
814   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
815   );
816 }
817 
RAWToRGB24Row_NEON(const uint8 * src_raw,uint8 * dst_rgb24,int width)818 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
819   asm volatile (
820   "1:                                          \n"
821     MEMACCESS(0)
822     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
823     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
824     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
825     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
826     MEMACCESS(1)
827     "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
828     "b.gt       1b                             \n"
829   : "+r"(src_raw),    // %0
830     "+r"(dst_rgb24),  // %1
831     "+r"(width)       // %2
832   :
833   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
834   );
835 }
836 
// Expands 8 RGB565 pixels held in v0.8h into 8-bit channels by replicating
// the top bits of each field into its low bits.
// Result (low 64 bits of each register): v0 = B, v1 = G, v2 = R.
// Scratch registers: v4, v6.
#define RGB565TOARGB                                                          \
    "shrn       v6.8b, v0.8h, #5               \n" /* G: xxGGGGGG          */ \
    "shl        v6.8b, v6.8b, #2               \n" /* G: GGGGGG00 (top 6)  */ \
    "ushr       v4.8b, v6.8b, #6               \n" /* G: 000000GG (low 2)  */ \
    "orr        v1.8b, v4.8b, v6.8b            \n" /* G complete           */ \
    "xtn        v2.8b, v0.8h                   \n" /* B: xxxBBBBB          */ \
    "ushr       v0.8h, v0.8h, #11              \n" /* R: 000RRRRR          */ \
    "xtn2       v2.16b,v0.8h                   \n" /* R into the high half */ \
    "shl        v2.16b, v2.16b, #3             \n" /* B,R: BBBBB000 (top 5)*/ \
    "ushr       v0.16b, v2.16b, #5             \n" /* B,R: 00000BBB (low 3)*/ \
    "orr        v0.16b, v0.16b, v2.16b         \n" /* B (low), R (high)    */ \
    "dup        v2.2D, v0.D[1]                 \n" /* R copied into v2     */
849 
RGB565ToARGBRow_NEON(const uint8 * src_rgb565,uint8 * dst_argb,int width)850 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
851   asm volatile (
852     "movi       v3.8b, #255                    \n"  // Alpha
853   "1:                                          \n"
854     MEMACCESS(0)
855     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
856     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
857     RGB565TOARGB
858     MEMACCESS(1)
859     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
860     "b.gt       1b                             \n"
861   : "+r"(src_rgb565),  // %0
862     "+r"(dst_argb),    // %1
863     "+r"(width)          // %2
864   :
865   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
866   );
867 }
868 
// Expands 8 ARGB1555 pixels held in v0.8h into 8-bit channels. The 5-bit
// colour fields are widened by replicating their top 3 bits; the single alpha
// bit is sign-extended to 0x00 or 0xFF.
// Result (low 64 bits of each register): v0 = B, v1 = G, v2 = R, v3 = A.
#define ARGB1555TOARGB                                                        \
    "ushr       v2.8h, v0.8h, #10              \n" /* R: xxxRRRRR          */ \
    "shl        v2.8h, v2.8h, #3               \n" /* R: RRRRR000 (top 5)  */ \
    "xtn        v3.8b, v2.8h                   \n" /* v3 low  = RRRRR000   */ \
                                                                              \
    "sshr       v2.8h, v0.8h, #15              \n" /* A: AAAAAAAA          */ \
    "xtn2       v3.16b, v2.8h                  \n" /* v3 high = AAAAAAAA   */ \
                                                                              \
    "xtn        v2.8b, v0.8h                   \n" /* B: xxxBBBBB          */ \
    "shrn2      v2.16b,v0.8h, #5               \n" /* G: xxxGGGGG          */ \
                                                                              \
    "ushr       v1.16b, v3.16b, #5             \n" /* R,A: 00000RRR (low 3)*/ \
    "shl        v0.16b, v2.16b, #3             \n" /* B,G: BBBBB000 (top 5)*/ \
    "ushr       v2.16b, v0.16b, #5             \n" /* B,G: 00000BBB (low 3)*/ \
                                                                              \
    "orr        v0.16b, v0.16b, v2.16b         \n" /* B (low), G (high)    */ \
    "orr        v2.16b, v1.16b, v3.16b         \n" /* R (low), A (high)    */ \
    "dup        v1.2D, v0.D[1]                 \n" /* G copied into v1     */ \
    "dup        v3.2D, v2.D[1]                 \n" /* A copied into v3     */
888 
// RGB555TOARGB is the same expansion as ARGB1555TOARGB but ignores the alpha
// bit. Input: 8 pixels in v0.8h.
// Result (low 64 bits of each register): v0 = B, v1 = G, v2 = R.
// v3 is used as scratch and does NOT hold alpha afterwards.
// The final line deliberately has no line-continuation backslash, so the
// macro ends here and cannot absorb whatever text follows it.
#define RGB555TOARGB                                                          \
    "ushr       v2.8h, v0.8h, #10              \n" /* R: xxxRRRRR          */ \
    "shl        v2.8h, v2.8h, #3               \n" /* R: RRRRR000 (top 5)  */ \
    "xtn        v3.8b, v2.8h                   \n" /* v3 low = RRRRR000    */ \
                                                                              \
    "xtn        v2.8b, v0.8h                   \n" /* B: xxxBBBBB          */ \
    "shrn2      v2.16b,v0.8h, #5               \n" /* G: xxxGGGGG          */ \
                                                                              \
    "ushr       v1.16b, v3.16b, #5             \n" /* R: 00000RRR (low 3)  */ \
    "shl        v0.16b, v2.16b, #3             \n" /* B,G: BBBBB000 (top 5)*/ \
    "ushr       v2.16b, v0.16b, #5             \n" /* B,G: 00000BBB (low 3)*/ \
                                                                              \
    "orr        v0.16b, v0.16b, v2.16b         \n" /* B (low), G (high)    */ \
    "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
    "dup        v1.2D, v0.D[1]                 \n" /* G copied into v1     */
905 
ARGB1555ToARGBRow_NEON(const uint8 * src_argb1555,uint8 * dst_argb,int width)906 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
907                             int width) {
908   asm volatile (
909     "movi       v3.8b, #255                    \n"  // Alpha
910   "1:                                          \n"
911     MEMACCESS(0)
912     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
913     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
914     ARGB1555TOARGB
915     MEMACCESS(1)
916     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
917     "b.gt       1b                             \n"
918   : "+r"(src_argb1555),  // %0
919     "+r"(dst_argb),    // %1
920     "+r"(width)          // %2
921   :
922   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
923   );
924 }
925 
// Expands 8 ARGB4444 pixels held in v0.8h into 8-bit channels by duplicating
// each 4-bit field into both nibbles of the output byte.
// Result (low 64 bits of each register): v0 = B, v1 = G, v2 = R, v3 = A.
#define ARGB4444TOARGB                                                        \
    "shrn       v1.8b,  v0.8h, #8              \n" /* v1 low  = AR bytes   */ \
    "xtn2       v1.16b, v0.8h                  \n" /* v1 high = GB bytes   */ \
    "shl        v2.16b, v1.16b, #4             \n" /* B,R: BBBB0000        */ \
    "ushr       v3.16b, v1.16b, #4             \n" /* G,A: 0000GGGG        */ \
    "ushr       v0.16b, v2.16b, #4             \n" /* B,R: 0000BBBB        */ \
    "shl        v1.16b, v3.16b, #4             \n" /* G,A: GGGG0000        */ \
    "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R: BBBBBBBB        */ \
    "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A: GGGGGGGG        */ \
    "dup        v0.2D, v2.D[1]                 \n" /* B copied into v0     */ \
    "dup        v1.2D, v3.D[1]                 \n" /* G copied into v1     */
937 
ARGB4444ToARGBRow_NEON(const uint8 * src_argb4444,uint8 * dst_argb,int width)938 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
939                             int width) {
940   asm volatile (
941   "1:                                          \n"
942     MEMACCESS(0)
943     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
944     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
945     ARGB4444TOARGB
946     MEMACCESS(1)
947     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
948     "b.gt       1b                             \n"
949   : "+r"(src_argb4444),  // %0
950     "+r"(dst_argb),    // %1
951     "+r"(width)          // %2
952   :
953   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
954   );
955 }
956 
ARGBToRGB24Row_NEON(const uint8 * src_argb,uint8 * dst_rgb24,int width)957 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
958   asm volatile (
959   "1:                                          \n"
960     MEMACCESS(0)
961     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
962     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
963     MEMACCESS(1)
964     "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
965     "b.gt       1b                             \n"
966   : "+r"(src_argb),   // %0
967     "+r"(dst_rgb24),  // %1
968     "+r"(width)         // %2
969   :
970   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
971   );
972 }
973 
ARGBToRAWRow_NEON(const uint8 * src_argb,uint8 * dst_raw,int width)974 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
975   asm volatile (
976   "1:                                          \n"
977     MEMACCESS(0)
978     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
979     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
980     "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
981     "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
982     MEMACCESS(1)
983     "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
984     "b.gt       1b                             \n"
985   : "+r"(src_argb),  // %0
986     "+r"(dst_raw),   // %1
987     "+r"(width)        // %2
988   :
989   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
990   );
991 }
992 
YUY2ToYRow_NEON(const uint8 * src_yuy2,uint8 * dst_y,int width)993 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
994   asm volatile (
995   "1:                                          \n"
996     MEMACCESS(0)
997     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
998     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
999     MEMACCESS(1)
1000     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1001     "b.gt       1b                             \n"
1002   : "+r"(src_yuy2),  // %0
1003     "+r"(dst_y),     // %1
1004     "+r"(width)        // %2
1005   :
1006   : "cc", "memory", "v0", "v1"  // Clobber List
1007   );
1008 }
1009 
UYVYToYRow_NEON(const uint8 * src_uyvy,uint8 * dst_y,int width)1010 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
1011   asm volatile (
1012   "1:                                          \n"
1013     MEMACCESS(0)
1014     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
1015     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
1016     MEMACCESS(1)
1017     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
1018     "b.gt       1b                             \n"
1019   : "+r"(src_uyvy),  // %0
1020     "+r"(dst_y),     // %1
1021     "+r"(width)        // %2
1022   :
1023   : "cc", "memory", "v0", "v1"  // Clobber List
1024   );
1025 }
1026 
YUY2ToUV422Row_NEON(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)1027 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1028                          int width) {
1029   asm volatile (
1030   "1:                                          \n"
1031     MEMACCESS(0)
1032     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
1033     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1034     MEMACCESS(1)
1035     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1036     MEMACCESS(2)
1037     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1038     "b.gt       1b                             \n"
1039   : "+r"(src_yuy2),  // %0
1040     "+r"(dst_u),     // %1
1041     "+r"(dst_v),     // %2
1042     "+r"(width)        // %3
1043   :
1044   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1045   );
1046 }
1047 
UYVYToUV422Row_NEON(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1048 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1049                          int width) {
1050   asm volatile (
1051   "1:                                          \n"
1052     MEMACCESS(0)
1053     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
1054     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1055     MEMACCESS(1)
1056     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1057     MEMACCESS(2)
1058     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1059     "b.gt       1b                             \n"
1060   : "+r"(src_uyvy),  // %0
1061     "+r"(dst_u),     // %1
1062     "+r"(dst_v),     // %2
1063     "+r"(width)        // %3
1064   :
1065   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1066   );
1067 }
1068 
YUY2ToUVRow_NEON(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)1069 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1070                       uint8* dst_u, uint8* dst_v, int width) {
1071   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1072   asm volatile (
1073   "1:                                          \n"
1074     MEMACCESS(0)
1075     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1076     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1077     MEMACCESS(1)
1078     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1079     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
1080     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
1081     MEMACCESS(2)
1082     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
1083     MEMACCESS(3)
1084     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
1085     "b.gt       1b                             \n"
1086   : "+r"(src_yuy2),     // %0
1087     "+r"(src_yuy2b),    // %1
1088     "+r"(dst_u),        // %2
1089     "+r"(dst_v),        // %3
1090     "+r"(width)           // %4
1091   :
1092   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1093     "v5", "v6", "v7"  // Clobber List
1094   );
1095 }
1096 
UYVYToUVRow_NEON(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1097 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1098                       uint8* dst_u, uint8* dst_v, int width) {
1099   const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1100   asm volatile (
1101   "1:                                          \n"
1102     MEMACCESS(0)
1103     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1104     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1105     MEMACCESS(1)
1106     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1107     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
1108     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
1109     MEMACCESS(2)
1110     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
1111     MEMACCESS(3)
1112     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
1113     "b.gt       1b                             \n"
1114   : "+r"(src_uyvy),     // %0
1115     "+r"(src_uyvyb),    // %1
1116     "+r"(dst_u),        // %2
1117     "+r"(dst_v),        // %3
1118     "+r"(width)           // %4
1119   :
1120   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1121     "v5", "v6", "v7"  // Clobber List
1122   );
1123 }
1124 
1125 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)1126 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1127                          const uint8* shuffler, int width) {
1128   asm volatile (
1129     MEMACCESS(3)
1130     "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1131   "1:                                          \n"
1132     MEMACCESS(0)
1133     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1134     "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1135     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1136     MEMACCESS(1)
1137     "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1138     "b.gt       1b                             \n"
1139   : "+r"(src_argb),  // %0
1140     "+r"(dst_argb),  // %1
1141     "+r"(width)        // %2
1142   : "r"(shuffler)    // %3
1143   : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1144   );
1145 }
1146 
I422ToYUY2Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_yuy2,int width)1147 void I422ToYUY2Row_NEON(const uint8* src_y,
1148                         const uint8* src_u,
1149                         const uint8* src_v,
1150                         uint8* dst_yuy2, int width) {
1151   asm volatile (
1152   "1:                                          \n"
1153     MEMACCESS(0)
1154     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
1155     "orr        v2.8b, v1.8b, v1.8b            \n"
1156     MEMACCESS(1)
1157     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
1158     MEMACCESS(2)
1159     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
1160     "subs       %w4, %w4, #16                  \n"  // 16 pixels
1161     MEMACCESS(3)
1162     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1163     "b.gt       1b                             \n"
1164   : "+r"(src_y),     // %0
1165     "+r"(src_u),     // %1
1166     "+r"(src_v),     // %2
1167     "+r"(dst_yuy2),  // %3
1168     "+r"(width)      // %4
1169   :
1170   : "cc", "memory", "v0", "v1", "v2", "v3"
1171   );
1172 }
1173 
I422ToUYVYRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_uyvy,int width)1174 void I422ToUYVYRow_NEON(const uint8* src_y,
1175                         const uint8* src_u,
1176                         const uint8* src_v,
1177                         uint8* dst_uyvy, int width) {
1178   asm volatile (
1179   "1:                                          \n"
1180     MEMACCESS(0)
1181     "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
1182     "orr        v3.8b, v2.8b, v2.8b            \n"
1183     MEMACCESS(1)
1184     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1185     MEMACCESS(2)
1186     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1187     "subs       %w4, %w4, #16                  \n"  // 16 pixels
1188     MEMACCESS(3)
1189     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1190     "b.gt       1b                             \n"
1191   : "+r"(src_y),     // %0
1192     "+r"(src_u),     // %1
1193     "+r"(src_v),     // %2
1194     "+r"(dst_uyvy),  // %3
1195     "+r"(width)      // %4
1196   :
1197   : "cc", "memory", "v0", "v1", "v2", "v3"
1198   );
1199 }
1200 
ARGBToRGB565Row_NEON(const uint8 * src_argb,uint8 * dst_rgb565,int width)1201 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1202   asm volatile (
1203   "1:                                          \n"
1204     MEMACCESS(0)
1205     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1206     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1207     ARGBTORGB565
1208     MEMACCESS(1)
1209     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1210     "b.gt       1b                             \n"
1211   : "+r"(src_argb),  // %0
1212     "+r"(dst_rgb565),  // %1
1213     "+r"(width)        // %2
1214   :
1215   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1216   );
1217 }
1218 
ARGBToRGB565DitherRow_NEON(const uint8 * src_argb,uint8 * dst_rgb,const uint32 dither4,int width)1219 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
1220                                 const uint32 dither4, int width) {
1221   asm volatile (
1222     "dup        v1.4s, %w2                     \n"  // dither4
1223   "1:                                          \n"
1224     MEMACCESS(1)
1225     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
1226     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1227     "uqadd      v20.8b, v20.8b, v1.8b          \n"
1228     "uqadd      v21.8b, v21.8b, v1.8b          \n"
1229     "uqadd      v22.8b, v22.8b, v1.8b          \n"
1230     ARGBTORGB565
1231     MEMACCESS(0)
1232     "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
1233     "b.gt       1b                             \n"
1234   : "+r"(dst_rgb)    // %0
1235   : "r"(src_argb),   // %1
1236     "r"(dither4),    // %2
1237     "r"(width)       // %3
1238   : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1239   );
1240 }
1241 
ARGBToARGB1555Row_NEON(const uint8 * src_argb,uint8 * dst_argb1555,int width)1242 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1243                             int width) {
1244   asm volatile (
1245   "1:                                          \n"
1246     MEMACCESS(0)
1247     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1248     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1249     ARGBTOARGB1555
1250     MEMACCESS(1)
1251     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
1252     "b.gt       1b                             \n"
1253   : "+r"(src_argb),  // %0
1254     "+r"(dst_argb1555),  // %1
1255     "+r"(width)        // %2
1256   :
1257   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1258   );
1259 }
1260 
ARGBToARGB4444Row_NEON(const uint8 * src_argb,uint8 * dst_argb4444,int width)1261 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1262                             int width) {
1263   asm volatile (
1264     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
1265   "1:                                          \n"
1266     MEMACCESS(0)
1267     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1268     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1269     ARGBTOARGB4444
1270     MEMACCESS(1)
1271     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
1272     "b.gt       1b                             \n"
1273   : "+r"(src_argb),      // %0
1274     "+r"(dst_argb4444),  // %1
1275     "+r"(width)            // %2
1276   :
1277   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1278   );
1279 }
1280 
// Computes 8-bit luma (Y) from one row of ARGB pixels, 8 pixels per pass.
//   src_argb - source row; read 32 bytes per pass (post-incremented).
//   dst_y    - destination row; written 8 bytes per pass (post-incremented).
//   width    - pixel count; decremented by 8 per pass.
// Per pixel: Y = sat8(round((13*B + 65*G + 33*R) / 128)) then a saturating +16.
// The count is tested after the body, so one pass always runs.
ARGBToYRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1281 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1282   asm volatile (
1283     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1284     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1285     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1286     "movi       v7.8b, #16                     \n"  // Add 16 constant
1287   "1:                                          \n"
1288     MEMACCESS(0)
1289     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1290     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1291     "umull      v3.8h, v0.8b, v4.8b            \n"  // B (v3's loaded 4th channel is overwritten; it is not needed)
1292     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1293     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1294     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y (rounding shift, saturating narrow)
1295     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // saturating +16 offset
1296     MEMACCESS(1)
1297     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1298     "b.gt       1b                             \n"  // loop while width > 0
1299   : "+r"(src_argb),  // %0
1300     "+r"(dst_y),     // %1
1301     "+r"(width)        // %2
1302   :
1303   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1304   );
1305 }
1306 
// Copies the 4th byte of every 4-byte ARGB pixel (labelled A below) into a
// packed byte row, 16 pixels per pass.
//   src_argb - source row; read 64 bytes per pass (post-incremented).
//   dst_a    - destination row; written 16 bytes per pass (post-incremented).
//   width    - pixel count; decremented by 16 per pass.
// The count is tested after the body, so one pass always runs.
ARGBExtractAlphaRow_NEON(const uint8 * src_argb,uint8 * dst_a,int width)1307 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1308   asm volatile (
1309   "1:                                          \n"
1310     MEMACCESS(0)
1311     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels, de-interleaved by byte
1312     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
1313     MEMACCESS(1)
1314     "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
1315     "b.gt       1b                             \n"  // loop while width > 0
1316   : "+r"(src_argb),   // %0
1317     "+r"(dst_a),      // %1
1318     "+r"(width)       // %2
1319   :
1320   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1321   );
1322 }
1323 
// Computes 8-bit luma from one row of ARGB pixels using the "J" coefficient
// set below and no +16 offset, 8 pixels per pass.
//   src_argb - source row; read 32 bytes per pass (post-incremented).
//   dst_y    - destination row; written 8 bytes per pass (post-incremented).
//   width    - pixel count; decremented by 8 per pass.
// Per pixel: Y = sat8(round((15*B + 75*G + 38*R) / 128)).
// The count is tested after the body, so one pass always runs.
ARGBToYJRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1324 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1325   asm volatile (
1326     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
1327     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
1328     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
1329   "1:                                          \n"
1330     MEMACCESS(0)
1331     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1332     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1333     "umull      v3.8h, v0.8b, v4.8b            \n"  // B (v3's loaded 4th channel is overwritten; it is not needed)
1334     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1335     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1336     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y (coefficients sum to 128, so the sum fits 15 bits)
1337     MEMACCESS(1)
1338     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1339     "b.gt       1b                             \n"  // loop while width > 0
1340   : "+r"(src_argb),  // %0
1341     "+r"(dst_y),     // %1
1342     "+r"(width)        // %2
1343   :
1344   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1345   );
1346 }
1347 
1348 // 8x1 pixels.
// Computes one U and one V byte for every ARGB pixel (no subsampling).
//   src_argb - source row; read 32 bytes per pass (post-incremented).
//   dst_u    - U destination; written 8 bytes per pass (post-incremented).
//   dst_v    - V destination; written 8 bytes per pass (post-incremented).
//   width    - pixel count; decremented by 8 per pass.
// Per pixel, in 16-bit lanes:
//   U = (112*B - 74*G - 38*R + 0x8080) >> 8
//   V = (112*R - 94*G - 18*B + 0x8080) >> 8
// The count is tested after the body, so one pass always runs.
ARGBToUV444Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1349 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1350                          int width) {
1351   asm volatile (
1352     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
1353     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
1354     "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
1355     "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
1356     "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
1357     "movi       v29.16b,#0x80                  \n"  // 128.5 (0x8080 when read as 16-bit lanes)
1358   "1:                                          \n"
1359     MEMACCESS(0)
1360     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1361     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1362     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
1363     "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
1364     "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
1365     "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
1366 
1367     "umull      v3.8h, v2.8b, v24.8b           \n"  // R (v3's loaded 4th channel is overwritten; it is not needed)
1368     "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
1369     "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
1370     "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
1371 
1372     "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
1373     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1374 
1375     MEMACCESS(1)
1376     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1377     MEMACCESS(2)
1378     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1379     "b.gt       1b                             \n"  // loop while width > 0
1380   : "+r"(src_argb),  // %0
1381     "+r"(dst_u),     // %1
1382     "+r"(dst_v),     // %2
1383     "+r"(width)        // %3
1384   :
1385   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1386     "v24", "v25", "v26", "v27", "v28", "v29"
1387   );
1388 }
1389 
// Asm fragment that preloads the constants used by the subsampled UV kernels:
// v20..v24 hold the half-scale U/V coefficients in every 16-bit lane and v25
// holds the 0x8080 bias. The coefficients are halved because those kernels
// feed in a "2x average" (sum of 4 samples, rounded, divided by 2).
// Any asm statement expanding this must list v20..v25 as clobbers.
1390 #define RGBTOUV_SETUP_REG                                                      \
1391     "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
1392     "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
1393     "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
1394     "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
1395     "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
1396     "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
1397 
1398 // 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
// Produces one U and one V byte per 4 horizontally adjacent ARGB pixels.
//   src_argb - source row; read 128 bytes (two 64-byte loads) per pass.
//   dst_u    - U destination; written 8 bytes per pass (post-incremented).
//   dst_v    - V destination; written 8 bytes per pass (post-incremented).
//   width    - pixel count; decremented by 32 per pass.
// Each channel is summed over 4 pixels (uaddlp pairs, then addp pairs of
// pairs), halved with rounding to a "2x average", and combined with the
// half-scale coefficients loaded by RGBTOUV_SETUP_REG.
// The count is tested after the body, so one pass always runs.
ARGBToUV411Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1399 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1400                          int width) {
1401   asm volatile (
1402     RGBTOUV_SETUP_REG
1403   "1:                                          \n"
1404     MEMACCESS(0)
1405     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1406     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1407     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1408     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1409     MEMACCESS(0)
1410     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
1411     "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1412     "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1413     "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1414 
1415     "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
1416     "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
1417     "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
1418 
1419     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average (sum of 4, rounded, / 2)
1420     "urshr      v1.8h, v1.8h, #1               \n"
1421     "urshr      v2.8h, v2.8h, #1               \n"
1422 
1423     "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
1424     "mul        v3.8h, v0.8h, v20.8h           \n"  // B
1425     "mls        v3.8h, v1.8h, v21.8h           \n"  // G
1426     "mls        v3.8h, v2.8h, v22.8h           \n"  // R
1427     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1428     "mul        v4.8h, v2.8h, v20.8h           \n"  // R
1429     "mls        v4.8h, v1.8h, v24.8h           \n"  // G
1430     "mls        v4.8h, v0.8h, v23.8h           \n"  // B
1431     "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
1432     "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
1433     "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
1434     MEMACCESS(1)
1435     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1436     MEMACCESS(2)
1437     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1438     "b.gt       1b                             \n"  // loop while width > 0
1439   : "+r"(src_argb),  // %0
1440     "+r"(dst_u),     // %1
1441     "+r"(dst_v),     // %2
1442     "+r"(width)        // %3
1443   :
1444   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1445     "v20", "v21", "v22", "v23", "v24", "v25"
1446   );
1447 }
1448 
1449 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Asm fragment: turns three vectors of 16-bit channel values into 8 U bytes
// (v0.8b) and 8 V bytes (v1.8b).
//   QB, QG, QR - register operands (e.g. v0.8h) holding the B, G and R inputs;
//                they are pasted into the instruction text via stringification.
// Computes, per 16-bit lane:
//   v3 = QB*v20 - QG*v21 - QR*v22 + v25, narrowed (>> 8, saturating) into v0
//   v4 = QR*v20 - QG*v24 - QB*v23 + v25, narrowed (>> 8, saturating) into v1
// Requires v20..v25 to hold the coefficients and bias beforehand. v3 and v4
// are written before all inputs have been read, so QB/QG/QR must not be v3 or
// v4. Overwrites v0, v1, v3 and v4.
1450 #define RGBTOUV(QB, QG, QR) \
1451     "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
1452     "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
1453     "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
1454     "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
1455     "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
1456     "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
1457     "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
1458     "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
1459     "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
1460     "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
1461 
1462 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1463 // TODO(fbarchard): consider ptrdiff_t for all strides.
1464 
// Produces one U and one V byte for every 2x2 block of ARGB pixels.
//   src_argb        - first source row; read 64 bytes per pass.
//   src_stride_argb - byte offset from src_argb to the second source row.
//   dst_u, dst_v    - destinations; each written 8 bytes per pass.
//   width           - pixel count per row; decremented by 16 per pass.
// Each channel is summed over the 2x2 block (uaddlp on row 0, uadalp adds
// row 1), halved with rounding to a "2x average", then passed to RGBTOUV with
// the half-scale coefficients from RGBTOUV_SETUP_REG.
// The count is tested after the body, so one pass always runs.
ARGBToUVRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1465 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1466                       uint8* dst_u, uint8* dst_v, int width) {
1467   const uint8* src_argb_1 = src_argb + src_stride_argb;
1468   asm volatile (
1469     RGBTOUV_SETUP_REG
1470   "1:                                          \n"
1471     MEMACCESS(0)
1472     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1473     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1474     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1475     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1476 
1477     MEMACCESS(1)
1478     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 pixels of the second row
1479     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1480     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1481     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1482 
1483     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1484     "urshr      v1.8h, v1.8h, #1               \n"
1485     "urshr      v2.8h, v2.8h, #1               \n"
1486 
1487     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1488     RGBTOUV(v0.8h, v1.8h, v2.8h)
1489     MEMACCESS(2)
1490     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1491     MEMACCESS(3)
1492     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1493     "b.gt       1b                             \n"  // loop while width > 0
1494   : "+r"(src_argb),  // %0
1495     "+r"(src_argb_1),  // %1
1496     "+r"(dst_u),     // %2
1497     "+r"(dst_v),     // %3
1498     "+r"(width)        // %4
1499   :
1500   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1501     "v20", "v21", "v22", "v23", "v24", "v25"
1502   );
1503 }
1504 
1505 // TODO(fbarchard): Subsample match C code.
// Same 2x2 subsampled U/V computation as the other *ToUVRow kernels, but with
// the "J" coefficient set loaded inline below instead of RGBTOUV_SETUP_REG.
//   src_argb        - first source row; read 64 bytes per pass.
//   src_stride_argb - byte offset from src_argb to the second source row.
//   dst_u, dst_v    - destinations; each written 8 bytes per pass.
//   width           - pixel count per row; decremented by 16 per pass.
// The count is tested after the body, so one pass always runs.
ARGBToUVJRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1506 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1507                        uint8* dst_u, uint8* dst_v, int width) {
1508   const uint8* src_argb_1 = src_argb + src_stride_argb;
1509   asm volatile (
1510     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
1511     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
1512     "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
1513     "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
1514     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
1515     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1516   "1:                                          \n"
1517     MEMACCESS(0)
1518     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1519     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1520     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1521     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1522     MEMACCESS(1)
1523     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load 16 pixels of the second row
1524     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1525     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1526     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1527 
1528     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1529     "urshr      v1.8h, v1.8h, #1               \n"
1530     "urshr      v2.8h, v2.8h, #1               \n"
1531 
1532     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1533     RGBTOUV(v0.8h, v1.8h, v2.8h)
1534     MEMACCESS(2)
1535     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1536     MEMACCESS(3)
1537     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1538     "b.gt       1b                             \n"  // loop while width > 0
1539   : "+r"(src_argb),  // %0
1540     "+r"(src_argb_1),  // %1
1541     "+r"(dst_u),     // %2
1542     "+r"(dst_v),     // %3
1543     "+r"(width)        // %4
1544   :
1545   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1546     "v20", "v21", "v22", "v23", "v24", "v25"
1547   );
1548 }
1549 
// Produces one U and one V byte for every 2x2 block of BGRA pixels.
//   src_bgra        - first source row; read 64 bytes per pass.
//   src_stride_bgra - byte offset from src_bgra to the second source row.
//   dst_u, dst_v    - destinations; each written 8 bytes per pass.
//   width           - pixel count per row; decremented by 16 per pass.
// After the de-interleaving load this layout has B in the 4th vector, G in
// the 3rd and R in the 2nd, so the pair-sums shuffle them into v0 (B),
// v3 -> v1 (G) and v2 (R) before RGBTOUV.
// The count is tested after the body, so one pass always runs.
BGRAToUVRow_NEON(const uint8 * src_bgra,int src_stride_bgra,uint8 * dst_u,uint8 * dst_v,int width)1550 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1551                       uint8* dst_u, uint8* dst_v, int width) {
1552   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1553   asm volatile (
1554     RGBTOUV_SETUP_REG
1555   "1:                                          \n"
1556     MEMACCESS(0)
1557     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1558     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
1559     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1560     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
1561     MEMACCESS(1)
1562     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more (second row)
1563     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
1564     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1565     "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
1566 
1567     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1568     "urshr      v1.8h, v3.8h, #1               \n"  // G moves from v3 to v1 (v3 is RGBTOUV scratch)
1569     "urshr      v2.8h, v2.8h, #1               \n"
1570 
1571     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1572     RGBTOUV(v0.8h, v1.8h, v2.8h)
1573     MEMACCESS(2)
1574     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1575     MEMACCESS(3)
1576     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1577     "b.gt       1b                             \n"  // loop while width > 0
1578   : "+r"(src_bgra),  // %0
1579     "+r"(src_bgra_1),  // %1
1580     "+r"(dst_u),     // %2
1581     "+r"(dst_v),     // %3
1582     "+r"(width)        // %4
1583   :
1584   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1585     "v20", "v21", "v22", "v23", "v24", "v25"
1586   );
1587 }
1588 
// Produces one U and one V byte for every 2x2 block of ABGR pixels.
//   src_abgr        - first source row; read 64 bytes per pass.
//   src_stride_abgr - byte offset from src_abgr to the second source row.
//   dst_u, dst_v    - destinations; each written 8 bytes per pass.
//   width           - pixel count per row; decremented by 16 per pass.
// After the de-interleaving load this layout has R in the 1st vector, G in
// the 2nd and B in the 3rd; the pair-sums are written in an order that never
// overwrites a vector before it has been read.
// The count is tested after the body, so one pass always runs.
ABGRToUVRow_NEON(const uint8 * src_abgr,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int width)1589 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1590                       uint8* dst_u, uint8* dst_v, int width) {
1591   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1592   asm volatile (
1593     RGBTOUV_SETUP_REG
1594   "1:                                          \n"
1595     MEMACCESS(0)
1596     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1597     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1598     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1599     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1600     MEMACCESS(1)
1601     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more (second row).
1602     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1603     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1604     "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1605 
1606     "urshr      v0.8h, v3.8h, #1               \n"  // 2x average; B moves from v3 to v0 (v3 is RGBTOUV scratch)
1607     "urshr      v2.8h, v2.8h, #1               \n"
1608     "urshr      v1.8h, v1.8h, #1               \n"
1609 
1610     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1611     RGBTOUV(v0.8h, v2.8h, v1.8h)
1612     MEMACCESS(2)
1613     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1614     MEMACCESS(3)
1615     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1616     "b.gt       1b                             \n"  // loop while width > 0
1617   : "+r"(src_abgr),  // %0
1618     "+r"(src_abgr_1),  // %1
1619     "+r"(dst_u),     // %2
1620     "+r"(dst_v),     // %3
1621     "+r"(width)        // %4
1622   :
1623   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1624     "v20", "v21", "v22", "v23", "v24", "v25"
1625   );
1626 }
1627 
// Produces one U and one V byte for every 2x2 block of RGBA pixels.
//   src_rgba        - first source row; read 64 bytes per pass.
//   src_stride_rgba - byte offset from src_rgba to the second source row.
//   dst_u, dst_v    - destinations; each written 8 bytes per pass.
//   width           - pixel count per row; decremented by 16 per pass.
// After the de-interleaving load this layout has B in the 2nd vector, G in
// the 3rd and R in the 4th; the pair-sums shift them down into v0, v1, v2.
// The count is tested after the body, so one pass always runs.
RGBAToUVRow_NEON(const uint8 * src_rgba,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int width)1628 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1629                       uint8* dst_u, uint8* dst_v, int width) {
1630   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1631   asm volatile (
1632     RGBTOUV_SETUP_REG
1633   "1:                                          \n"
1634     MEMACCESS(0)
1635     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1636     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
1637     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1638     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
1639     MEMACCESS(1)
1640     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more (second row).
1641     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
1642     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1643     "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
1644 
1645     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1646     "urshr      v1.8h, v1.8h, #1               \n"
1647     "urshr      v2.8h, v2.8h, #1               \n"
1648 
1649     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1650     RGBTOUV(v0.8h, v1.8h, v2.8h)
1651     MEMACCESS(2)
1652     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1653     MEMACCESS(3)
1654     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1655     "b.gt       1b                             \n"  // loop while width > 0
1656   : "+r"(src_rgba),  // %0
1657     "+r"(src_rgba_1),  // %1
1658     "+r"(dst_u),     // %2
1659     "+r"(dst_v),     // %3
1660     "+r"(width)        // %4
1661   :
1662   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1663     "v20", "v21", "v22", "v23", "v24", "v25"
1664   );
1665 }
1666 
// Produces one U and one V byte for every 2x2 block of 3-byte RGB24 pixels.
//   src_rgb24        - first source row; read 48 bytes per pass.
//   src_stride_rgb24 - byte offset from src_rgb24 to the second source row.
//   dst_u, dst_v     - destinations; each written 8 bytes per pass.
//   width            - pixel count per row; decremented by 16 per pass.
// The 3-way de-interleaving load yields B, G, R in v0, v1, v2 directly.
// The count is tested after the body, so one pass always runs.
RGB24ToUVRow_NEON(const uint8 * src_rgb24,int src_stride_rgb24,uint8 * dst_u,uint8 * dst_v,int width)1667 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1668                        uint8* dst_u, uint8* dst_v, int width) {
1669   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1670   asm volatile (
1671     RGBTOUV_SETUP_REG
1672   "1:                                          \n"
1673     MEMACCESS(0)
1674     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
1675     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1676     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1677     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1678     MEMACCESS(1)
1679     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more (second row).
1680     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1681     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1682     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1683 
1684     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1685     "urshr      v1.8h, v1.8h, #1               \n"
1686     "urshr      v2.8h, v2.8h, #1               \n"
1687 
1688     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1689     RGBTOUV(v0.8h, v1.8h, v2.8h)
1690     MEMACCESS(2)
1691     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1692     MEMACCESS(3)
1693     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1694     "b.gt       1b                             \n"  // loop while width > 0
1695   : "+r"(src_rgb24),  // %0
1696     "+r"(src_rgb24_1),  // %1
1697     "+r"(dst_u),     // %2
1698     "+r"(dst_v),     // %3
1699     "+r"(width)        // %4
1700   :
1701   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1702     "v20", "v21", "v22", "v23", "v24", "v25"
1703   );
1704 }
1705 
// Produces one U and one V byte for every 2x2 block of 3-byte RAW pixels.
//   src_raw        - first source row; read 48 bytes per pass.
//   src_stride_raw - byte offset from src_raw to the second source row.
//   dst_u, dst_v   - destinations; each written 8 bytes per pass.
//   width          - pixel count per row; decremented by 16 per pass.
// The channel order is the reverse of RGB24: the de-interleaving load yields
// R, G, B in v0, v1, v2, so RGBTOUV is invoked with the operands swapped.
// The count is tested after the body, so one pass always runs.
RAWToUVRow_NEON(const uint8 * src_raw,int src_stride_raw,uint8 * dst_u,uint8 * dst_v,int width)1706 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1707                      uint8* dst_u, uint8* dst_v, int width) {
1708   const uint8* src_raw_1 = src_raw + src_stride_raw;
1709   asm volatile (
1710     RGBTOUV_SETUP_REG
1711   "1:                                          \n"
1712     MEMACCESS(0)
1713     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
1714     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1715     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1716     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1717     MEMACCESS(1)
1718     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels (second row)
1719     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1720     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1721     "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1722 
1723     "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
1724     "urshr      v1.8h, v1.8h, #1               \n"
1725     "urshr      v0.8h, v0.8h, #1               \n"
1726 
1727     "subs       %w4, %w4, #16                  \n"  // 16 pixels per row (x2 rows) processed per loop.
1728     RGBTOUV(v2.8h, v1.8h, v0.8h)
1729     MEMACCESS(2)
1730     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1731     MEMACCESS(3)
1732     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1733     "b.gt       1b                             \n"  // loop while width > 0
1734   : "+r"(src_raw),  // %0
1735     "+r"(src_raw_1),  // %1
1736     "+r"(dst_u),     // %2
1737     "+r"(dst_v),     // %3
1738     "+r"(width)        // %4
1739   :
1740   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1741     "v20", "v21", "v22", "v23", "v24", "v25"
1742   );
1743 }
1744 
1745 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces one U and one V byte for every 2x2 block of RGB565 pixels.
//   src_rgb565        - first source row; read 32 bytes (two 16-byte loads) per pass.
//   src_stride_rgb565 - byte offset from src_rgb565 to the second source row.
//   dst_u, dst_v      - destinations; each written 8 bytes per pass.
//   width             - pixel count per row; decremented by 16 per pass.
// Each 16-byte load is expanded by the RGB565TOARGB macro (defined outside
// this function; presumably leaves 8-bit B, G, R in v0, v1, v2 - confirm
// against the macro). The coefficients live in v22..v27 here rather than
// v20..v25 because v16..v21 are used as the channel accumulators.
// The count is tested after the body, so one pass always runs.
RGB565ToUVRow_NEON(const uint8 * src_rgb565,int src_stride_rgb565,uint8 * dst_u,uint8 * dst_v,int width)1746 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1747                         uint8* dst_u, uint8* dst_v, int width) {
1748   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1749   asm volatile (
1750     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
1751     "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
1752     "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
1753     "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
1754     "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
1755     "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1756   "1:                                          \n"
1757     MEMACCESS(0)
1758     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1759     RGB565TOARGB
1760     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1761     "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1762     "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1763     MEMACCESS(0)
1764     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
1765     RGB565TOARGB
1766     "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1767     "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1768     "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1769 
1770     MEMACCESS(1)
1771     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels (second row).
1772     RGB565TOARGB
1773     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1774     "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1775     "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1776     MEMACCESS(1)
1777     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels (second row).
1778     RGB565TOARGB
1779     "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1780     "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1781     "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1782 
1783     "ins        v16.D[1], v17.D[0]             \n"  // join the two 4-short halves: B
1784     "ins        v18.D[1], v19.D[0]             \n"  // G
1785     "ins        v20.D[1], v21.D[0]             \n"  // R
1786 
1787     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1788     "urshr      v5.8h, v18.8h, #1              \n"
1789     "urshr      v6.8h, v20.8h, #1              \n"
1790 
1791     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1792     "mul        v16.8h, v4.8h, v22.8h          \n"  // B
1793     "mls        v16.8h, v5.8h, v23.8h          \n"  // G
1794     "mls        v16.8h, v6.8h, v24.8h          \n"  // R
1795     "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
1796     "mul        v17.8h, v6.8h, v22.8h          \n"  // R
1797     "mls        v17.8h, v5.8h, v26.8h          \n"  // G
1798     "mls        v17.8h, v4.8h, v25.8h          \n"  // B
1799     "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
1800     "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
1801     "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
1802     MEMACCESS(2)
1803     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1804     MEMACCESS(3)
1805     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1806     "b.gt       1b                             \n"  // loop while width > 0
1807   : "+r"(src_rgb565),  // %0
1808     "+r"(src_rgb565_1),  // %1
1809     "+r"(dst_u),     // %2
1810     "+r"(dst_v),     // %3
1811     "+r"(width)        // %4
1812   :
1813   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1814     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1815     "v25", "v26", "v27"
1816   );
1817 }
1818 
1819 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces one U and one V byte for every 2x2 block of ARGB1555 pixels.
//   src_argb1555        - first source row; read 32 bytes (two 16-byte loads) per pass.
//   src_stride_argb1555 - byte offset from src_argb1555 to the second source row.
//   dst_u, dst_v        - destinations; each written 8 bytes per pass.
//   width               - pixel count per row; decremented by 16 per pass.
// Each 16-byte load is expanded by the RGB555TOARGB macro (defined outside
// this function; presumably leaves 8-bit B, G, R in v0, v1, v2 - confirm
// against the macro). Accumulators are v16..v18 (first 8 pixels) and v26..v28
// (next 8), which keeps v20..v25 free for the RGBTOUV_SETUP_REG constants.
// The count is tested after the body, so one pass always runs.
ARGB1555ToUVRow_NEON(const uint8 * src_argb1555,int src_stride_argb1555,uint8 * dst_u,uint8 * dst_v,int width)1820 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
1821                         uint8* dst_u, uint8* dst_v, int width) {
1822   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1823   asm volatile (
1824     RGBTOUV_SETUP_REG
1825   "1:                                          \n"
1826     MEMACCESS(0)
1827     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1828     RGB555TOARGB
1829     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1830     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1831     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1832     MEMACCESS(0)
1833     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
1834     RGB555TOARGB
1835     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1836     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1837     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1838 
1839     MEMACCESS(1)
1840     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels (second row).
1841     RGB555TOARGB
1842     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1843     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1844     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1845     MEMACCESS(1)
1846     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels (second row).
1847     RGB555TOARGB
1848     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1849     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1850     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1851 
1852     "ins        v16.D[1], v26.D[0]             \n"  // join the two 4-short halves: B
1853     "ins        v17.D[1], v27.D[0]             \n"  // G
1854     "ins        v18.D[1], v28.D[0]             \n"  // R
1855 
1856     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1857     "urshr      v5.8h, v17.8h, #1              \n"
1858     "urshr      v6.8h, v18.8h, #1              \n"
1859 
1860     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1861     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1862     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1863     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1864     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1865     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1866     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1867     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1868     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1869     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1870     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1871     MEMACCESS(2)
1872     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1873     MEMACCESS(3)
1874     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1875     "b.gt       1b                             \n"  // loop while width > 0
1876   : "+r"(src_argb1555),  // %0
1877     "+r"(src_argb1555_1),  // %1
1878     "+r"(dst_u),     // %2
1879     "+r"(dst_v),     // %3
1880     "+r"(width)        // %4
1881   :
1882   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1883     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1884     "v26", "v27", "v28"
1885   );
1886 }
1887 
1888 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Produces subsampled U and V from two rows of ARGB4444 pixels.
//   src_argb4444        - first source row (2 bytes per pixel).
//   src_stride_argb4444 - byte offset from the first row to the second row.
//   dst_u, dst_v        - outputs; 8 bytes are written to each per loop pass.
//   width               - pixel count; 16 are consumed per pass and the loop
//                         repeats while the remaining count is > 0, so it
//                         always runs at least once.
// Each pass pairwise-adds horizontally adjacent B/G/R bytes of the first row
// (uaddlp), accumulates the same pairs of the second row (uadalp), and halves
// the 2x2 sums with rounding (urshr #1).  U and V are then formed with
// mul/mls against v20-v24, biased by v25 and narrowed with uqshrn #8.
// ARGB4444TOARGB and RGBTOUV_SETUP_REG are macros defined outside this
// function; the constants in v20-v25 are presumably loaded by
// RGBTOUV_SETUP_REG -- confirm against its definition.
ARGB4444ToUVRow_NEON(const uint8 * src_argb4444,int src_stride_argb4444,uint8 * dst_u,uint8 * dst_v,int width)1889 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
1890                           uint8* dst_u, uint8* dst_v, int width) {
1891   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
1892   asm volatile (
1893     RGBTOUV_SETUP_REG
1894   "1:                                          \n"
    // Row 0, pixels 0..7: start the sums in v16-v18.
1895     MEMACCESS(0)
1896     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1897     ARGB4444TOARGB
1898     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1899     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1900     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
    // Row 0, pixels 8..15: start the sums in v26-v28.
1901     MEMACCESS(0)
1902     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
1903     ARGB4444TOARGB
1904     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1905     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1906     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1907 
    // Row 1: accumulate onto the row 0 sums.
1908     MEMACCESS(1)
1909     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
1910     ARGB4444TOARGB
1911     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1912     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1913     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1914     MEMACCESS(1)
1915     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
1916     ARGB4444TOARGB
1917     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1918     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1919     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1920 
    // Merge the two half-width results into full 8-lane vectors.
1921     "ins        v16.D[1], v26.D[0]             \n"
1922     "ins        v17.D[1], v27.D[0]             \n"
1923     "ins        v18.D[1], v28.D[0]             \n"
1924 
1925     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1926     "urshr      v5.8h, v17.8h, #1              \n"
1927     "urshr      v6.8h, v18.8h, #1              \n"
1928 
1929     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1930     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1931     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1932     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1933     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1934     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1935     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1936     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1937     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1938     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1939     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1940     MEMACCESS(2)
1941     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1942     MEMACCESS(3)
1943     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1944     "b.gt       1b                             \n"  // flags come from the subs above.
1945   : "+r"(src_argb4444),  // %0
1946     "+r"(src_argb4444_1),  // %1
1947     "+r"(dst_u),     // %2
1948     "+r"(dst_v),     // %3
1949     "+r"(width)        // %4
1950   :
1951   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1952     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1953     "v26", "v27", "v28"
1954 
1955   );
1956 }
1957 
// Converts a row of RGB565 pixels to 8-bit luma.
//   src_rgb565 - source pixels, 2 bytes each (16 bytes are read per pass).
//   dst_y      - output; 8 bytes are written per pass.
//   width      - pixel count; 8 are handled per pass and the loop repeats
//                while the remaining count is > 0 (always at least one pass).
// Per pixel: Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16, with the
// final add saturating.  RGB565TOARGB is a macro defined outside this
// function; it presumably leaves B/G/R in v0/v1/v2 and uses v4/v6 as scratch
// (they are clobber-listed but not referenced here) -- confirm.
RGB565ToYRow_NEON(const uint8 * src_rgb565,uint8 * dst_y,int width)1958 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
1959   asm volatile (
1960     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
1961     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
1962     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
1963     "movi       v27.8b, #16                    \n"  // Add 16 constant
1964   "1:                                          \n"
1965     MEMACCESS(0)
1966     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1967     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1968     RGB565TOARGB
1969     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
1970     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
1971     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
1972     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1973     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
1974     MEMACCESS(1)
1975     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1976     "b.gt       1b                             \n"  // flags come from the subs above.
1977   : "+r"(src_rgb565),  // %0
1978     "+r"(dst_y),       // %1
1979     "+r"(width)          // %2
1980   :
1981   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
1982     "v24", "v25", "v26", "v27"
1983   );
1984 }
1985 
// Converts a row of ARGB1555 pixels to 8-bit luma.
//   src_argb1555 - source pixels, 2 bytes each (16 bytes are read per pass).
//   dst_y        - output; 8 bytes are written per pass.
//   width        - pixel count; 8 are handled per pass and the loop repeats
//                  while the remaining count is > 0 (always at least one pass).
// Per pixel: Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16, with the
// final add saturating.
// NOTE(review): the coefficients are kept in v4-v7 across iterations while
// ARGB1555TOARGB (a macro defined outside this function) runs inside the
// loop.  Confirm that the macro does not write v4-v7 and that any scratch
// registers it uses are covered by the clobber list.
ARGB1555ToYRow_NEON(const uint8 * src_argb1555,uint8 * dst_y,int width)1986 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
1987   asm volatile (
1988     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1989     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1990     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1991     "movi       v7.8b, #16                     \n"  // Add 16 constant
1992   "1:                                          \n"
1993     MEMACCESS(0)
1994     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1995     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1996     ARGB1555TOARGB
1997     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1998     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1999     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
2000     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
2001     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2002     MEMACCESS(1)
2003     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2004     "b.gt       1b                             \n"  // flags come from the subs above.
2005   : "+r"(src_argb1555),  // %0
2006     "+r"(dst_y),         // %1
2007     "+r"(width)            // %2
2008   :
2009   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2010   );
2011 }
2012 
// Converts a row of ARGB4444 pixels to 8-bit luma.
//   src_argb4444 - source pixels, 2 bytes each (16 bytes are read per pass).
//   dst_y        - output; 8 bytes are written per pass.
//   width        - pixel count; 8 are handled per pass and the loop repeats
//                  while the remaining count is > 0 (always at least one pass).
// Per pixel: Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16, with the
// final add saturating.  ARGB4444TOARGB is a macro defined outside this
// function; it presumably leaves B/G/R in v0/v1/v2 -- confirm, and confirm
// that every register it writes is in the clobber list.
ARGB4444ToYRow_NEON(const uint8 * src_argb4444,uint8 * dst_y,int width)2013 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
2014   asm volatile (
2015     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
2016     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
2017     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
2018     "movi       v27.8b, #16                    \n"  // Add 16 constant
2019   "1:                                          \n"
2020     MEMACCESS(0)
2021     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
2022     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2023     ARGB4444TOARGB
2024     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
2025     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
2026     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
2027     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
2028     "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
2029     MEMACCESS(1)
2030     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2031     "b.gt       1b                             \n"  // flags come from the subs above.
2032   : "+r"(src_argb4444),  // %0
2033     "+r"(dst_y),         // %1
2034     "+r"(width)            // %2
2035   :
2036   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2037   );
2038 }
2039 
// Converts a row of BGRA pixels (4 bytes each) to 8-bit luma.
//   src_bgra - source pixels; 32 bytes are read per pass.
//   dst_y    - output; 8 bytes are written per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
// ld4 de-interleaves the four bytes of each pixel into v0..v3.  This variant
// takes R from v1, G from v2 and B from v3; the first byte (v0) is not used
// as an input.  Per pixel:
//   Y = sat8(round((33 * R + 65 * G + 13 * B) >> 7)) + 16 (saturating add).
BGRAToYRow_NEON(const uint8 * src_bgra,uint8 * dst_y,int width)2040 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2041   asm volatile (
2042     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2043     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2044     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2045     "movi       v7.8b, #16                     \n"  // Add 16 constant
2046   "1:                                          \n"
2047     MEMACCESS(0)
2048     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2049     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2050     "umull      v16.8h, v1.8b, v4.8b           \n"  // R
2051     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2052     "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
2053     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2054     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2055     MEMACCESS(1)
2056     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2057     "b.gt       1b                             \n"  // flags come from the subs above.
2058   : "+r"(src_bgra),  // %0
2059     "+r"(dst_y),     // %1
2060     "+r"(width)        // %2
2061   :
2062   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2063   );
2064 }
2065 
// Converts a row of ABGR pixels (4 bytes each) to 8-bit luma.
//   src_abgr - source pixels; 32 bytes are read per pass.
//   dst_y    - output; 8 bytes are written per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
// ld4 de-interleaves the four bytes of each pixel into v0..v3.  This variant
// takes R from v0, G from v1 and B from v2; the fourth byte (v3) is not used.
// Per pixel:
//   Y = sat8(round((33 * R + 65 * G + 13 * B) >> 7)) + 16 (saturating add).
ABGRToYRow_NEON(const uint8 * src_abgr,uint8 * dst_y,int width)2066 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2067   asm volatile (
2068     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2069     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2070     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2071     "movi       v7.8b, #16                     \n"  // Add 16 constant
2072   "1:                                          \n"
2073     MEMACCESS(0)
2074     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2075     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2076     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2077     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2078     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2079     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2080     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2081     MEMACCESS(1)
2082     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2083     "b.gt       1b                             \n"  // flags come from the subs above.
2084   : "+r"(src_abgr),  // %0
2085     "+r"(dst_y),     // %1
2086     "+r"(width)        // %2
2087   :
2088   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2089   );
2090 }
2091 
// Converts a row of RGBA pixels (4 bytes each) to 8-bit luma.
//   src_rgba - source pixels; 32 bytes are read per pass.
//   dst_y    - output; 8 bytes are written per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
// ld4 de-interleaves the four bytes of each pixel into v0..v3.  This variant
// takes B from v1, G from v2 and R from v3; the first byte (v0) is not used
// as an input.  Per pixel:
//   Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16 (saturating add).
RGBAToYRow_NEON(const uint8 * src_rgba,uint8 * dst_y,int width)2092 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2093   asm volatile (
2094     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2095     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2096     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2097     "movi       v7.8b, #16                     \n"  // Add 16 constant
2098   "1:                                          \n"
2099     MEMACCESS(0)
2100     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2101     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2102     "umull      v16.8h, v1.8b, v4.8b           \n"  // B
2103     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2104     "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
2105     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2106     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2107     MEMACCESS(1)
2108     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2109     "b.gt       1b                             \n"  // flags come from the subs above.
2110   : "+r"(src_rgba),  // %0
2111     "+r"(dst_y),     // %1
2112     "+r"(width)        // %2
2113   :
2114   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2115   );
2116 }
2117 
// Converts a row of RGB24 pixels (3 bytes each) to 8-bit luma.
//   src_rgb24 - source pixels; 24 bytes are read per pass.
//   dst_y     - output; 8 bytes are written per pass.
//   width     - pixel count; 8 are handled per pass and the loop repeats
//               while the remaining count is > 0 (always at least one pass).
// ld3 de-interleaves the three bytes of each pixel into v0..v2, used here as
// B, G and R respectively.  Per pixel:
//   Y = sat8(round((13 * B + 65 * G + 33 * R) >> 7)) + 16 (saturating add).
RGB24ToYRow_NEON(const uint8 * src_rgb24,uint8 * dst_y,int width)2118 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2119   asm volatile (
2120     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2121     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2122     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2123     "movi       v7.8b, #16                     \n"  // Add 16 constant
2124   "1:                                          \n"
2125     MEMACCESS(0)
2126     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2127     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2128     "umull      v16.8h, v0.8b, v4.8b           \n"  // B
2129     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2130     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
2131     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2132     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2133     MEMACCESS(1)
2134     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2135     "b.gt       1b                             \n"  // flags come from the subs above.
2136   : "+r"(src_rgb24),  // %0
2137     "+r"(dst_y),      // %1
2138     "+r"(width)         // %2
2139   :
2140   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2141   );
2142 }
2143 
// Converts a row of RAW pixels (3 bytes each) to 8-bit luma.
//   src_raw - source pixels; 24 bytes are read per pass.
//   dst_y   - output; 8 bytes are written per pass.
//   width   - pixel count; 8 are handled per pass and the loop repeats while
//             the remaining count is > 0 (always at least one pass).
// ld3 de-interleaves the three bytes of each pixel into v0..v2.  Here v0 is
// weighted by the R coefficient, v1 by G and v2 by B, i.e. the channel order
// is the reverse of RGB24ToYRow_NEON.  Per pixel:
//   Y = sat8(round((33 * R + 65 * G + 13 * B) >> 7)) + 16 (saturating add).
RAWToYRow_NEON(const uint8 * src_raw,uint8 * dst_y,int width)2144 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2145   asm volatile (
2146     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2147     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2148     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2149     "movi       v7.8b, #16                     \n"  // Add 16 constant
2150   "1:                                          \n"
2151     MEMACCESS(0)
2152     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2153     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2154     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2155     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2156     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2157     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2158     "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
2159     MEMACCESS(1)
2160     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2161     "b.gt       1b                             \n"  // flags come from the subs above.
2162   : "+r"(src_raw),  // %0
2163     "+r"(dst_y),    // %1
2164     "+r"(width)       // %2
2165   :
2166   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2167   );
2168 }
2169 
2170 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2171 void InterpolateRow_NEON(uint8* dst_ptr,
2172                          const uint8* src_ptr, ptrdiff_t src_stride,
2173                          int dst_width, int source_y_fraction) {
2174   int y1_fraction = source_y_fraction;
2175   int y0_fraction = 256 - y1_fraction;
2176   const uint8* src_ptr1 = src_ptr + src_stride;
2177   asm volatile (
2178     "cmp        %w4, #0                        \n"
2179     "b.eq       100f                           \n"
2180     "cmp        %w4, #128                      \n"
2181     "b.eq       50f                            \n"
2182 
2183     "dup        v5.16b, %w4                    \n"
2184     "dup        v4.16b, %w5                    \n"
2185     // General purpose row blend.
2186   "1:                                          \n"
2187     MEMACCESS(1)
2188     "ld1        {v0.16b}, [%1], #16            \n"
2189     MEMACCESS(2)
2190     "ld1        {v1.16b}, [%2], #16            \n"
2191     "subs       %w3, %w3, #16                  \n"
2192     "umull      v2.8h, v0.8b,  v4.8b           \n"
2193     "umull2     v3.8h, v0.16b, v4.16b          \n"
2194     "umlal      v2.8h, v1.8b,  v5.8b           \n"
2195     "umlal2     v3.8h, v1.16b, v5.16b          \n"
2196     "rshrn      v0.8b,  v2.8h, #8              \n"
2197     "rshrn2     v0.16b, v3.8h, #8              \n"
2198     MEMACCESS(0)
2199     "st1        {v0.16b}, [%0], #16            \n"
2200     "b.gt       1b                             \n"
2201     "b          99f                            \n"
2202 
2203     // Blend 50 / 50.
2204   "50:                                         \n"
2205     MEMACCESS(1)
2206     "ld1        {v0.16b}, [%1], #16            \n"
2207     MEMACCESS(2)
2208     "ld1        {v1.16b}, [%2], #16            \n"
2209     "subs       %w3, %w3, #16                  \n"
2210     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2211     MEMACCESS(0)
2212     "st1        {v0.16b}, [%0], #16            \n"
2213     "b.gt       50b                            \n"
2214     "b          99f                            \n"
2215 
2216     // Blend 100 / 0 - Copy row unchanged.
2217   "100:                                        \n"
2218     MEMACCESS(1)
2219     "ld1        {v0.16b}, [%1], #16            \n"
2220     "subs       %w3, %w3, #16                  \n"
2221     MEMACCESS(0)
2222     "st1        {v0.16b}, [%0], #16            \n"
2223     "b.gt       100b                           \n"
2224 
2225   "99:                                         \n"
2226   : "+r"(dst_ptr),          // %0
2227     "+r"(src_ptr),          // %1
2228     "+r"(src_ptr1),         // %2
2229     "+r"(dst_width),        // %3
2230     "+r"(y1_fraction),      // %4
2231     "+r"(y0_fraction)       // %5
2232   :
2233   : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2234   );
2235 }
2236 
2237 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// Blends a row of src_argb0 pixels over src_argb1 and writes to dst_argb.
//   src_argb0 - pixels whose 4th byte (v3) supplies the alpha "sa" and whose
//               first three bytes are added as "s".
//   src_argb1 - pixels supplying "d"; its 4th byte (v7) is loaded but unused.
//   dst_argb  - output; the 4th byte of every output pixel is set to 255.
//   width     - pixel count.  Groups of 8 are handled first, then any
//               remainder one pixel at a time; nothing is written when
//               width <= 0.
// All channel arithmetic is saturating; d * sa is shifted right by 8 with
// rounding (uqrshrn).
ARGBBlendRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2238 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2239                        uint8* dst_argb, int width) {
2240   asm volatile (
    // Pre-decrement so the sign of the counter tells whether a full group of
    // 8 is still available.
2241     "subs       %w3, %w3, #8                   \n"
2242     "b.lt       89f                            \n"
2243     // Blend 8 pixels.
2244   "8:                                          \n"
2245     MEMACCESS(0)
2246     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
2247     MEMACCESS(1)
2248     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
2249     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2250     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2251     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2252     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2253     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2254     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2255     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2256     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2257     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2258     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2259     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2260     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2261     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2262     "movi       v3.8b, #255                    \n"  // a = 255
2263     MEMACCESS(2)
2264     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2265     "b.ge       8b                             \n"  // flags come from the subs above.
2266 
    // Undo the pre-decrement, leaving (remaining - 1) for the 1-pixel loop.
2267   "89:                                         \n"
2268     "adds       %w3, %w3, #8-1                 \n"
2269     "b.lt       99f                            \n"
2270 
2271     // Blend 1 pixels.
2272   "1:                                          \n"
2273     MEMACCESS(0)
2274     "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
2275     MEMACCESS(1)
2276     "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
2277     "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
2278     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2279     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2280     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2281     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2282     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2283     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2284     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2285     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2286     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2287     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2288     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2289     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2290     "movi       v3.8b, #255                    \n"  // a = 255
2291     MEMACCESS(2)
2292     "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
2293     "b.ge       1b                             \n"  // flags come from the subs above.
2294 
2295   "99:                                         \n"
2296 
2297   : "+r"(src_argb0),    // %0
2298     "+r"(src_argb1),    // %1
2299     "+r"(dst_argb),     // %2
2300     "+r"(width)         // %3
2301   :
2302   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2303     "v16", "v17", "v18"
2304   );
2305 }
2306 
2307 // Attenuate 8 pixels at a time.
// Multiplies the first three bytes of every 4-byte pixel by the pixel's 4th
// byte (alpha), shifts right by 8 with rounding and saturation, and stores
// the result with alpha unchanged.
//   src_argb - source pixels; 32 bytes are read per pass.
//   dst_argb - output; 32 bytes are written per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
ARGBAttenuateRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2308 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2309   asm volatile (
2310     // Attenuate 8 pixels.
2311   "1:                                          \n"
2312     MEMACCESS(0)
2313     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
2314     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2315     "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
2316     "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
2317     "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
2318     "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
2319     "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
2320     "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
2321     MEMACCESS(1)
2322     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2323     "b.gt       1b                             \n"  // flags come from the subs above.
2324   : "+r"(src_argb),   // %0
2325     "+r"(dst_argb),   // %1
2326     "+r"(width)       // %2
2327   :
2328   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2329   );
2330 }
2331 
2332 // Quantize 8 ARGB pixels (32 bytes).
2333 // dst = (dst * scale >> 16) * interval_size + interval_offset;
// Operates in place on dst_argb: the first three bytes of each pixel are
// quantized and the 4th byte (v3) is stored back unchanged.
//   dst_argb        - pixels to modify; 32 bytes are read and rewritten per
//                     pass.
//   scale           - multiplier; only its low 16 bits are broadcast.  It is
//                     halved first because sqdmulh doubles the product before
//                     taking the high 16 bits.
//   interval_size   - multiplier applied after scaling (low 16 bits used).
//   interval_offset - value added last (low 16 bits used).
//   width           - pixel count; 8 are handled per pass and the loop
//                     repeats while the remaining count is > 0 (always at
//                     least one pass).
// Results are narrowed back to bytes with unsigned saturation (uqxtn).
ARGBQuantizeRow_NEON(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)2334 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2335                           int interval_offset, int width) {
2336   asm volatile (
2337     "dup        v4.8h, %w2                     \n"
2338     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
2339     "dup        v5.8h, %w3                     \n"  // interval multiply.
2340     "dup        v6.8h, %w4                     \n"  // interval add
2341 
2342     // 8 pixel loop.
2343   "1:                                          \n"
2344     MEMACCESS(0)
    // No post-increment on the load: the store below advances the pointer.
2345     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
2346     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2347     "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
2348     "uxtl       v1.8h, v1.8b                   \n"
2349     "uxtl       v2.8h, v2.8b                   \n"
2350     "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
2351     "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
2352     "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
2353     "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
2354     "mul        v1.8h, v1.8h, v5.8h            \n"  // g
2355     "mul        v2.8h, v2.8h, v5.8h            \n"  // r
2356     "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
2357     "add        v1.8h, v1.8h, v6.8h            \n"  // g
2358     "add        v2.8h, v2.8h, v6.8h            \n"  // r
2359     "uqxtn      v0.8b, v0.8h                   \n"
2360     "uqxtn      v1.8b, v1.8h                   \n"
2361     "uqxtn      v2.8b, v2.8h                   \n"
2362     MEMACCESS(0)
2363     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
2364     "b.gt       1b                             \n"  // flags come from the subs above.
2365   : "+r"(dst_argb),       // %0
2366     "+r"(width)           // %1
2367   : "r"(scale),           // %2
2368     "r"(interval_size),   // %3
2369     "r"(interval_offset)  // %4
2370   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2371   );
2372 }
2373 
2374 // Shade 8 pixels at a time by specified value.
2375 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
2376 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// NOTE(review): the two notes above use ARMv7 mnemonics; the code below uses
// the AArch64 sqrdmulh by-element form with v0.h[n] -- confirm the
// equivalent register restriction for this encoding.
// Scales each of the four bytes of every pixel by the matching byte of
// value: byte 0 of value scales the first pixel byte, byte 1 the second, and
// so on.
//   src_argb - source pixels; 32 bytes are read per pass.
//   dst_argb - output; 32 bytes are written per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
//   value    - four packed per-channel scale bytes.
// zip1 duplicates each scale byte into a 16-bit lane, which is halved because
// sqrdmulh doubles the product before taking the rounded high half.
ARGBShadeRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)2377 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2378                        uint32 value) {
2379   asm volatile (
2380     "dup        v0.4s, %w3                     \n"  // duplicate scale value.
2381     "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
2382     "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
2383 
2384     // 8 pixel loop.
2385   "1:                                          \n"
2386     MEMACCESS(0)
2387     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2388     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2389     "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
2390     "uxtl       v5.8h, v5.8b                   \n"
2391     "uxtl       v6.8h, v6.8b                   \n"
2392     "uxtl       v7.8h, v7.8b                   \n"
2393     "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
2394     "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
2395     "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
2396     "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
2397     "uqxtn      v4.8b, v4.8h                   \n"
2398     "uqxtn      v5.8b, v5.8h                   \n"
2399     "uqxtn      v6.8b, v6.8h                   \n"
2400     "uqxtn      v7.8b, v7.8h                   \n"
2401     MEMACCESS(1)
2402     "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2403     "b.gt       1b                             \n"  // flags come from the subs above.
2404   : "+r"(src_argb),       // %0
2405     "+r"(dst_argb),       // %1
2406     "+r"(width)           // %2
2407   : "r"(value)            // %3
2408   : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2409   );
2410 }
2411 
2412 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
2413 // Similar to ARGBToYJ but stores ARGB.
2414 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
// The gray value is written to the first three bytes of each output pixel;
// the 4th byte (v3) is copied through from the source.
//   src_argb - source pixels; 32 bytes are read per pass.
//   dst_argb - output; 32 bytes are written per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
ARGBGrayRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2415 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2416   asm volatile (
2417     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
2418     "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
2419     "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
2420   "1:                                          \n"
2421     MEMACCESS(0)
2422     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2423     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2424     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
2425     "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
2426     "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
2427     "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
2428     "orr        v1.8b, v0.8b, v0.8b            \n"  // G
2429     "orr        v2.8b, v0.8b, v0.8b            \n"  // R
2430     MEMACCESS(1)
2431     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
2432     "b.gt       1b                             \n"  // flags come from the subs above.
2433   : "+r"(src_argb),  // %0
2434     "+r"(dst_argb),  // %1
2435     "+r"(width)      // %2
2436   :
2437   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2438   );
2439 }
2440 
2441 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2442 //    b = (r * 35 + g * 68 + b * 17) >> 7
2443 //    g = (r * 45 + g * 88 + b * 22) >> 7
2444 //    r = (r * 50 + g * 98 + b * 24) >> 7
2445 
// Operates in place on dst_argb; the 4th byte of each pixel (v3) is stored
// back unchanged.  The >> 7 narrowing truncates (no rounding) and saturates
// to 8 bits (uqshrn).
//   dst_argb - pixels to modify; 32 bytes are read and rewritten per pass.
//   width    - pixel count; 8 are handled per pass and the loop repeats while
//              the remaining count is > 0 (always at least one pass).
// Coefficient names below read <output channel><input channel>.
ARGBSepiaRow_NEON(uint8 * dst_argb,int width)2446 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2447   asm volatile (
2448     "movi       v20.8b, #17                    \n"  // BB coefficient
2449     "movi       v21.8b, #68                    \n"  // BG coefficient
2450     "movi       v22.8b, #35                    \n"  // BR coefficient
2451     "movi       v24.8b, #22                    \n"  // GB coefficient
2452     "movi       v25.8b, #88                    \n"  // GG coefficient
2453     "movi       v26.8b, #45                    \n"  // GR coefficient
2454     "movi       v28.8b, #24                    \n"  // RB coefficient
2455     "movi       v29.8b, #98                    \n"  // RG coefficient
2456     "movi       v30.8b, #50                    \n"  // RR coefficient
2457   "1:                                          \n"
2458     MEMACCESS(0)
    // No post-increment on the load: the store below advances the pointer.
2459     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
2460     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2461     "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
2462     "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
2463     "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
2464     "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
2465     "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
2466     "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
2467     "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
2468     "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
2469     "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
2470     "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
2471     "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
2472     "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
2473     MEMACCESS(0)
2474     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
2475     "b.gt       1b                             \n"  // flags come from the subs above.
2476   : "+r"(dst_argb),  // %0
2477     "+r"(width)      // %1
2478   :
2479   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2480     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2481   );
2482 }
2483 
2484 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
2485 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2486 // needs to saturate.  Consider doing a non-saturating version.
ARGBColorMatrixRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const int8 * matrix_argb,int width)2487 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2488                              const int8* matrix_argb, int width) {
2489   asm volatile (
2490     MEMACCESS(3)
2491     "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
2492     "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
2493     "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
2494 
2495   "1:                                          \n"
2496     MEMACCESS(0)
2497     "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
2498     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2499     "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
2500     "uxtl       v17.8h, v17.8b                 \n"  // g
2501     "uxtl       v18.8h, v18.8b                 \n"  // r
2502     "uxtl       v19.8h, v19.8b                 \n"  // a
2503     "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
2504     "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
2505     "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
2506     "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
2507     "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
2508     "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
2509     "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
2510     "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
2511     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2512     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2513     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2514     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2515     "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
2516     "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
2517     "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
2518     "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
2519     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2520     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2521     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2522     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2523     "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
2524     "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
2525     "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
2526     "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
2527     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2528     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2529     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2530     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2531     "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
2532     "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
2533     "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
2534     "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
2535     MEMACCESS(1)
2536     "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
2537     "b.gt       1b                             \n"
2538   : "+r"(src_argb),   // %0
2539     "+r"(dst_argb),   // %1
2540     "+r"(width)       // %2
2541   : "r"(matrix_argb)  // %3
2542   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
2543     "v18", "v19", "v22", "v23", "v24", "v25"
2544   );
2545 }
2546 
2547 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2548 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2549 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2550                           uint8* dst_argb, int width) {
2551   asm volatile (
2552     // 8 pixel loop.
2553   "1:                                          \n"
2554     MEMACCESS(0)
2555     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2556     MEMACCESS(1)
2557     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2558     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2559     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
2560     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
2561     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
2562     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
2563     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
2564     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
2565     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
2566     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
2567     MEMACCESS(2)
2568     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2569     "b.gt       1b                             \n"
2570 
2571   : "+r"(src_argb0),  // %0
2572     "+r"(src_argb1),  // %1
2573     "+r"(dst_argb),   // %2
2574     "+r"(width)       // %3
2575   :
2576   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2577   );
2578 }
2579 
2580 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2581 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2582                      uint8* dst_argb, int width) {
2583   asm volatile (
2584     // 8 pixel loop.
2585   "1:                                          \n"
2586     MEMACCESS(0)
2587     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2588     MEMACCESS(1)
2589     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2590     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2591     "uqadd      v0.8b, v0.8b, v4.8b            \n"
2592     "uqadd      v1.8b, v1.8b, v5.8b            \n"
2593     "uqadd      v2.8b, v2.8b, v6.8b            \n"
2594     "uqadd      v3.8b, v3.8b, v7.8b            \n"
2595     MEMACCESS(2)
2596     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2597     "b.gt       1b                             \n"
2598 
2599   : "+r"(src_argb0),  // %0
2600     "+r"(src_argb1),  // %1
2601     "+r"(dst_argb),   // %2
2602     "+r"(width)       // %3
2603   :
2604   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2605   );
2606 }
2607 
2608 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2609 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2610                           uint8* dst_argb, int width) {
2611   asm volatile (
2612     // 8 pixel loop.
2613   "1:                                          \n"
2614     MEMACCESS(0)
2615     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2616     MEMACCESS(1)
2617     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2618     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2619     "uqsub      v0.8b, v0.8b, v4.8b            \n"
2620     "uqsub      v1.8b, v1.8b, v5.8b            \n"
2621     "uqsub      v2.8b, v2.8b, v6.8b            \n"
2622     "uqsub      v3.8b, v3.8b, v7.8b            \n"
2623     MEMACCESS(2)
2624     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2625     "b.gt       1b                             \n"
2626 
2627   : "+r"(src_argb0),  // %0
2628     "+r"(src_argb1),  // %1
2629     "+r"(dst_argb),   // %2
2630     "+r"(width)       // %3
2631   :
2632   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2633   );
2634 }
2635 
2636 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2637 // A = 255
2638 // R = Sobel
2639 // G = Sobel
2640 // B = Sobel
SobelRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2641 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2642                      uint8* dst_argb, int width) {
2643   asm volatile (
2644     "movi       v3.8b, #255                    \n"  // alpha
2645     // 8 pixel loop.
2646   "1:                                          \n"
2647     MEMACCESS(0)
2648     "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
2649     MEMACCESS(1)
2650     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
2651     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2652     "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
2653     "orr        v1.8b, v0.8b, v0.8b            \n"
2654     "orr        v2.8b, v0.8b, v0.8b            \n"
2655     MEMACCESS(2)
2656     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2657     "b.gt       1b                             \n"
2658   : "+r"(src_sobelx),  // %0
2659     "+r"(src_sobely),  // %1
2660     "+r"(dst_argb),    // %2
2661     "+r"(width)        // %3
2662   :
2663   : "cc", "memory", "v0", "v1", "v2", "v3"
2664   );
2665 }
2666 
2667 // Adds Sobel X and Sobel Y and stores Sobel into plane.
SobelToPlaneRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)2668 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2669                           uint8* dst_y, int width) {
2670   asm volatile (
2671     // 16 pixel loop.
2672   "1:                                          \n"
2673     MEMACCESS(0)
2674     "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
2675     MEMACCESS(1)
2676     "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
2677     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
2678     "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
2679     MEMACCESS(2)
2680     "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
2681     "b.gt       1b                             \n"
2682   : "+r"(src_sobelx),  // %0
2683     "+r"(src_sobely),  // %1
2684     "+r"(dst_y),       // %2
2685     "+r"(width)        // %3
2686   :
2687   : "cc", "memory", "v0", "v1"
2688   );
2689 }
2690 
2691 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2692 // A = 255
2693 // R = Sobel X
2694 // G = Sobel
2695 // B = Sobel Y
SobelXYRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2696 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2697                      uint8* dst_argb, int width) {
2698   asm volatile (
2699     "movi       v3.8b, #255                    \n"  // alpha
2700     // 8 pixel loop.
2701   "1:                                          \n"
2702     MEMACCESS(0)
2703     "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
2704     MEMACCESS(1)
2705     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
2706     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2707     "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
2708     MEMACCESS(2)
2709     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2710     "b.gt       1b                             \n"
2711   : "+r"(src_sobelx),  // %0
2712     "+r"(src_sobely),  // %1
2713     "+r"(dst_argb),    // %2
2714     "+r"(width)        // %3
2715   :
2716   : "cc", "memory", "v0", "v1", "v2", "v3"
2717   );
2718 }
2719 
2720 // SobelX as a matrix is
2721 // -1  0  1
2722 // -2  0  2
2723 // -1  0  1
SobelXRow_NEON(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)2724 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
2725                     const uint8* src_y2, uint8* dst_sobelx, int width) {
2726   asm volatile (
2727   "1:                                          \n"
2728     MEMACCESS(0)
2729     "ld1        {v0.8b}, [%0],%5               \n"  // top
2730     MEMACCESS(0)
2731     "ld1        {v1.8b}, [%0],%6               \n"
2732     "usubl      v0.8h, v0.8b, v1.8b            \n"
2733     MEMACCESS(1)
2734     "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
2735     MEMACCESS(1)
2736     "ld1        {v3.8b}, [%1],%6               \n"
2737     "usubl      v1.8h, v2.8b, v3.8b            \n"
2738     "add        v0.8h, v0.8h, v1.8h            \n"
2739     "add        v0.8h, v0.8h, v1.8h            \n"
2740     MEMACCESS(2)
2741     "ld1        {v2.8b}, [%2],%5               \n"  // bottom
2742     MEMACCESS(2)
2743     "ld1        {v3.8b}, [%2],%6               \n"
2744     "subs       %w4, %w4, #8                   \n"  // 8 pixels
2745     "usubl      v1.8h, v2.8b, v3.8b            \n"
2746     "add        v0.8h, v0.8h, v1.8h            \n"
2747     "abs        v0.8h, v0.8h                   \n"
2748     "uqxtn      v0.8b, v0.8h                   \n"
2749     MEMACCESS(3)
2750     "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
2751     "b.gt       1b                             \n"
2752   : "+r"(src_y0),      // %0
2753     "+r"(src_y1),      // %1
2754     "+r"(src_y2),      // %2
2755     "+r"(dst_sobelx),  // %3
2756     "+r"(width)        // %4
2757   : "r"(2LL),          // %5
2758     "r"(6LL)           // %6
2759   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2760   );
2761 }
2762 
2763 // SobelY as a matrix is
2764 // -1 -2 -1
2765 //  0  0  0
2766 //  1  2  1
SobelYRow_NEON(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)2767 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
2768                     uint8* dst_sobely, int width) {
2769   asm volatile (
2770   "1:                                          \n"
2771     MEMACCESS(0)
2772     "ld1        {v0.8b}, [%0],%4               \n"  // left
2773     MEMACCESS(1)
2774     "ld1        {v1.8b}, [%1],%4               \n"
2775     "usubl      v0.8h, v0.8b, v1.8b            \n"
2776     MEMACCESS(0)
2777     "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
2778     MEMACCESS(1)
2779     "ld1        {v3.8b}, [%1],%4               \n"
2780     "usubl      v1.8h, v2.8b, v3.8b            \n"
2781     "add        v0.8h, v0.8h, v1.8h            \n"
2782     "add        v0.8h, v0.8h, v1.8h            \n"
2783     MEMACCESS(0)
2784     "ld1        {v2.8b}, [%0],%5               \n"  // right
2785     MEMACCESS(1)
2786     "ld1        {v3.8b}, [%1],%5               \n"
2787     "subs       %w3, %w3, #8                   \n"  // 8 pixels
2788     "usubl      v1.8h, v2.8b, v3.8b            \n"
2789     "add        v0.8h, v0.8h, v1.8h            \n"
2790     "abs        v0.8h, v0.8h                   \n"
2791     "uqxtn      v0.8b, v0.8h                   \n"
2792     MEMACCESS(2)
2793     "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
2794     "b.gt       1b                             \n"
2795   : "+r"(src_y0),      // %0
2796     "+r"(src_y1),      // %1
2797     "+r"(dst_sobely),  // %2
2798     "+r"(width)        // %3
2799   : "r"(1LL),          // %4
2800     "r"(6LL)           // %5
2801   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2802   );
2803 }
2804 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2805 
2806 #ifdef __cplusplus
2807 }  // extern "C"
2808 }  // namespace libyuv
2809 #endif
2810