/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
19 // STn over ZIP1+ST1
20 // Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
21 
22 // This module is for GCC Neon armv8 64 bit.
23 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
24 
// Register layout produced by the READ* macros:
// v0.8h: Y
// v1.16b: 8U, 8V

// Read 8 Y, 4 U and 4 V from 422.
// - Y: 8 bytes are loaded and zipped with themselves, so each 16-bit lane of
//   v0 holds the same Y byte twice.
// - U/V: 4 bytes of each are loaded into v1.s[0] / v1.s[1] and zipped with
//   themselves, so every chroma sample is duplicated for a pair of pixels.
// All three source pointers are post-incremented; prfm issues prefetch hints
// ahead of each pointer.
#define READYUV422                               \
  "ldr        d0, [%[src_y]], #8             \n" \
  "ld1        {v1.s}[0], [%[src_u]], #4      \n" \
  "ld1        {v1.s}[1], [%[src_v]], #4      \n" \
  "zip1       v0.16b, v0.16b, v0.16b         \n" \
  "prfm       pldl1keep, [%[src_y], 448]     \n" \
  "zip1       v1.16b, v1.16b, v1.16b         \n" \
  "prfm       pldl1keep, [%[src_u], 128]     \n" \
  "prfm       pldl1keep, [%[src_v], 128]     \n"

// Read 8 Y, 8 U and 8 V from 444.
// Y is duplicated into both bytes of each 16-bit lane of v0 (zip1 with
// itself). U fills the low 8 bytes of v1 and V the high 8 bytes; no chroma
// duplication is needed because there is one U/V sample per pixel.
// All three source pointers are post-incremented by 8 and prefetched ahead.
#define READYUV444                               \
  "ldr        d0, [%[src_y]], #8             \n" \
  "ld1        {v1.d}[0], [%[src_u]], #8      \n" \
  "prfm       pldl1keep, [%[src_y], 448]     \n" \
  "ld1        {v1.d}[1], [%[src_v]], #8      \n" \
  "prfm       pldl1keep, [%[src_u], 448]     \n" \
  "zip1       v0.16b, v0.16b, v0.16b         \n" \
  "prfm       pldl1keep, [%[src_v], 448]     \n"

// Read 8 Y and set every U and V byte to 128.
// There is no chroma plane: movi fills all 16 bytes of v1 (both the U half
// and the V half) with 128. Y is duplicated into both bytes of each 16-bit
// lane of v0, as in the other READ* macros.
#define READYUV400                               \
  "ldr        d0, [%[src_y]], #8             \n" \
  "movi       v1.16b, #128                   \n" \
  "prfm       pldl1keep, [%[src_y], 448]     \n" \
  "zip1       v0.16b, v0.16b, v0.16b         \n"

56 static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
57                                  1, 1, 3, 3, 5, 5, 7, 7};
58 static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
59                                  0, 0, 2, 2, 4, 4, 6, 6};
60 
// Read 8 Y and 4 UV from NV12 or NV21.
// Expects v2 to already hold the TBL index vector (kNV12Table or kNV21Table);
// the tbl then rearranges the 8 interleaved chroma bytes into 8 U followed by
// 8 V, each sample duplicated. Y is duplicated into both bytes of each 16-bit
// lane of v0. Both source pointers are post-incremented by 8.
#define READNV12                                 \
  "ldr        d0, [%[src_y]], #8             \n" \
  "ldr        d1, [%[src_uv]], #8            \n" \
  "zip1       v0.16b, v0.16b, v0.16b         \n" \
  "prfm       pldl1keep, [%[src_y], 448]     \n" \
  "tbl        v1.16b, {v1.16b}, v2.16b       \n" \
  "prfm       pldl1keep, [%[src_uv], 448]    \n"

// Read 8 YUY2 pixels (16 bytes).
// ld2 de-interleaves the stream: even bytes (Y) go to v0, odd bytes (the
// alternating U/V samples) go to v1. Y is then duplicated per 16-bit lane and
// the chroma bytes are rearranged by tbl using the index vector expected in
// v2.
#define READYUY2                                     \
  "ld2        {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
  "zip1       v0.16b, v0.16b, v0.16b         \n"     \
  "prfm       pldl1keep, [%[src_yuy2], 448]  \n"     \
  "tbl        v1.16b, {v1.16b}, v2.16b       \n"

// Read 8 UYVY pixels (16 bytes).
// ld2 de-interleaves the stream: even bytes (the alternating U/V samples) go
// to v3, odd bytes (Y) go to v4. Y is duplicated per 16-bit lane into v0 and
// the chroma bytes are rearranged into v1 by tbl using the index vector
// expected in v2. Uses v3 and v4 as scratch.
#define READUYVY                                     \
  "ld2        {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
  "zip1       v0.16b, v4.16b, v4.16b         \n"     \
  "prfm       pldl1keep, [%[src_uyvy], 448]  \n"     \
  "tbl        v1.16b, {v3.16b}, v2.16b       \n"

// Load the conversion constants once, before the pixel loop.
// ld4r reads four consecutive elements and broadcasts each one to every lane
// of its destination register:
//   kUVCoeff      (bytes):     v28 = UB, v29 = VR, v30 = UG, v31 = VG
//   kRGBCoeffBias (halfwords): v24 = YG, v25 = BB, v26 = BG, v27 = BR
#define YUVTORGB_SETUP                                                \
  "ld4r       {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
  "ld4r       {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"

// Output registers of YUVTORGB:
// v16.8h: B
// v17.8h: G
// v18.8h: R

// Convert from YUV to 2.14 fixed point RGB (RGBTORGB8 later narrows each
// channel to 8 bits with a saturating right shift by 6).
// Inputs: v0 = Y and v1 = 8U/8V as produced by a READ* macro, plus the
// constants loaded by YUVTORGB_SETUP (v24..v31).
//   Y  = (Y16 * YG) >> 16
//   DB = U * UB,  DR = V * VR,  DG = U * UG + V * VG
//   B  = Y + DB - BB,  G = Y + BG - DG,  R = Y + DR - BR
// The subtractions saturate at zero (uqsub). Scratch: v3, v4, v5, v6.
#define YUVTORGB                                          \
  "umull2     v3.4s, v0.8h, v24.8h           \n"          \
  "umull      v6.8h, v1.8b, v30.8b           \n"          \
  "umull      v0.4s, v0.4h, v24.4h           \n"          \
  "umlal2     v6.8h, v1.16b, v31.16b         \n" /* DG */ \
  "uqshrn     v0.4h, v0.4s, #16              \n"          \
  "uqshrn2    v0.8h, v3.4s, #16              \n" /* Y */  \
  "umull      v4.8h, v1.8b, v28.8b           \n" /* DB */ \
  "umull2     v5.8h, v1.16b, v29.16b         \n" /* DR */ \
  "add        v17.8h, v0.8h, v26.8h          \n" /* G */  \
  "add        v16.8h, v0.8h, v4.8h           \n" /* B */  \
  "add        v18.8h, v0.8h, v5.8h           \n" /* R */  \
  "uqsub      v17.8h, v17.8h, v6.8h          \n" /* G */  \
  "uqsub      v16.8h, v16.8h, v25.8h         \n" /* B */  \
  "uqsub      v18.8h, v18.8h, v27.8h         \n" /* R */

// Convert from 2.14 fixed point RGB To 8 bit RGB.
// Each 16-bit channel in v16 (B), v17 (G) and v18 (R) is shifted right by 6
// and narrowed to 8 bits with unsigned saturation; results land in the low
// 8 bytes of the same registers.
#define RGBTORGB8                                \
  "uqshrn     v17.8b, v17.8h, #6             \n" \
  "uqshrn     v16.8b, v16.8h, #6             \n" \
  "uqshrn     v18.8b, v18.8h, #6             \n"

// Clobber list shared by every row function that uses the READ*/YUVTORGB
// macros. It does not include v2 (the TBL index register) or the alpha
// registers; functions that use those append them to their own clobber list.
// NOTE(review): v7 is listed but is not written by any macro visible in this
// part of the file; it is kept unchanged in case other code relies on it.
#define YUVTORGB_REGS                                                          \
  "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
      "v26", "v27", "v28", "v29", "v30", "v31"

I444ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)121 void I444ToARGBRow_NEON(const uint8_t* src_y,
122                         const uint8_t* src_u,
123                         const uint8_t* src_v,
124                         uint8_t* dst_argb,
125                         const struct YuvConstants* yuvconstants,
126                         int width) {
127   asm volatile(
128       YUVTORGB_SETUP
129       "movi        v19.8b, #255                  \n" /* A */
130       "1:                                        \n" READYUV444 YUVTORGB
131           RGBTORGB8
132       "subs        %w[width], %w[width], #8      \n"
133       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
134       "b.gt        1b                            \n"
135       : [src_y] "+r"(src_y),                               // %[src_y]
136         [src_u] "+r"(src_u),                               // %[src_u]
137         [src_v] "+r"(src_v),                               // %[src_v]
138         [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
139         [width] "+r"(width)                                // %[width]
140       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
141         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
142       : "cc", "memory", YUVTORGB_REGS, "v19");
143 }
144 
I444ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)145 void I444ToRGB24Row_NEON(const uint8_t* src_y,
146                          const uint8_t* src_u,
147                          const uint8_t* src_v,
148                          uint8_t* dst_rgb24,
149                          const struct YuvConstants* yuvconstants,
150                          int width) {
151   asm volatile(
152       YUVTORGB_SETUP
153       "1:                                        \n" READYUV444 YUVTORGB
154           RGBTORGB8
155       "subs        %w[width], %w[width], #8      \n"
156       "st3         {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
157       "b.gt        1b                            \n"
158       : [src_y] "+r"(src_y),                               // %[src_y]
159         [src_u] "+r"(src_u),                               // %[src_u]
160         [src_v] "+r"(src_v),                               // %[src_v]
161         [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
162         [width] "+r"(width)                                // %[width]
163       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
164         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
165       : "cc", "memory", YUVTORGB_REGS);
166 }
167 
I422ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)168 void I422ToARGBRow_NEON(const uint8_t* src_y,
169                         const uint8_t* src_u,
170                         const uint8_t* src_v,
171                         uint8_t* dst_argb,
172                         const struct YuvConstants* yuvconstants,
173                         int width) {
174   asm volatile(
175       YUVTORGB_SETUP
176       "movi        v19.8b, #255                  \n" /* A */
177       "1:                                        \n" READYUV422 YUVTORGB
178           RGBTORGB8
179       "subs        %w[width], %w[width], #8      \n"
180       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
181       "b.gt        1b                            \n"
182       : [src_y] "+r"(src_y),                               // %[src_y]
183         [src_u] "+r"(src_u),                               // %[src_u]
184         [src_v] "+r"(src_v),                               // %[src_v]
185         [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
186         [width] "+r"(width)                                // %[width]
187       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
188         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
189       : "cc", "memory", YUVTORGB_REGS, "v19");
190 }
191 
I444AlphaToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,const uint8_t * src_a,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)192 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
193                              const uint8_t* src_u,
194                              const uint8_t* src_v,
195                              const uint8_t* src_a,
196                              uint8_t* dst_argb,
197                              const struct YuvConstants* yuvconstants,
198                              int width) {
199   asm volatile(
200       YUVTORGB_SETUP
201       "1:                                        \n"
202       "ld1         {v19.8b}, [%[src_a]], #8      \n" READYUV444
203       "prfm        pldl1keep, [%[src_a], 448]    \n" YUVTORGB RGBTORGB8
204       "subs        %w[width], %w[width], #8      \n"
205       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
206       "b.gt        1b                            \n"
207       : [src_y] "+r"(src_y),                               // %[src_y]
208         [src_u] "+r"(src_u),                               // %[src_u]
209         [src_v] "+r"(src_v),                               // %[src_v]
210         [src_a] "+r"(src_a),                               // %[src_a]
211         [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
212         [width] "+r"(width)                                // %[width]
213       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
214         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
215       : "cc", "memory", YUVTORGB_REGS, "v19");
216 }
217 
I422AlphaToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,const uint8_t * src_a,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)218 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
219                              const uint8_t* src_u,
220                              const uint8_t* src_v,
221                              const uint8_t* src_a,
222                              uint8_t* dst_argb,
223                              const struct YuvConstants* yuvconstants,
224                              int width) {
225   asm volatile(
226       YUVTORGB_SETUP
227       "1:                                        \n"
228       "ld1         {v19.8b}, [%[src_a]], #8      \n" READYUV422
229       "prfm        pldl1keep, [%[src_a], 448]    \n" YUVTORGB RGBTORGB8
230       "subs        %w[width], %w[width], #8      \n"
231       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
232       "b.gt        1b                            \n"
233       : [src_y] "+r"(src_y),                               // %[src_y]
234         [src_u] "+r"(src_u),                               // %[src_u]
235         [src_v] "+r"(src_v),                               // %[src_v]
236         [src_a] "+r"(src_a),                               // %[src_a]
237         [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
238         [width] "+r"(width)                                // %[width]
239       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
240         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
241       : "cc", "memory", YUVTORGB_REGS, "v19");
242 }
243 
I422ToRGBARow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgba,const struct YuvConstants * yuvconstants,int width)244 void I422ToRGBARow_NEON(const uint8_t* src_y,
245                         const uint8_t* src_u,
246                         const uint8_t* src_v,
247                         uint8_t* dst_rgba,
248                         const struct YuvConstants* yuvconstants,
249                         int width) {
250   asm volatile(
251       YUVTORGB_SETUP
252       "movi        v15.8b, #255                  \n" /* A */
253       "1:                                        \n" READYUV422 YUVTORGB
254           RGBTORGB8
255       "subs        %w[width], %w[width], #8      \n"
256       "st4         {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
257       "b.gt        1b                            \n"
258       : [src_y] "+r"(src_y),                               // %[src_y]
259         [src_u] "+r"(src_u),                               // %[src_u]
260         [src_v] "+r"(src_v),                               // %[src_v]
261         [dst_rgba] "+r"(dst_rgba),                         // %[dst_rgba]
262         [width] "+r"(width)                                // %[width]
263       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
264         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
265       : "cc", "memory", YUVTORGB_REGS, "v15");
266 }
267 
I422ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)268 void I422ToRGB24Row_NEON(const uint8_t* src_y,
269                          const uint8_t* src_u,
270                          const uint8_t* src_v,
271                          uint8_t* dst_rgb24,
272                          const struct YuvConstants* yuvconstants,
273                          int width) {
274   asm volatile(
275       YUVTORGB_SETUP
276       "1:                                        \n" READYUV422 YUVTORGB
277           RGBTORGB8
278       "subs        %w[width], %w[width], #8      \n"
279       "st3         {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
280       "b.gt        1b                            \n"
281       : [src_y] "+r"(src_y),                               // %[src_y]
282         [src_u] "+r"(src_u),                               // %[src_u]
283         [src_v] "+r"(src_v),                               // %[src_v]
284         [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
285         [width] "+r"(width)                                // %[width]
286       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
287         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
288       : "cc", "memory", YUVTORGB_REGS);
289 }
290 
// Pack 8-bit B (v16), G (v17) and R (v18) into 8 RGB565 halfwords in v18.
// Each channel is first widened into the top byte of a 16-bit lane; the two
// shift-right-and-insert steps then place G below the top 5 bits of R and B
// below the top 11 bits, giving RRRRRGGG GGGBBBBB per lane.
// Overwrites v16, v17 and v18.
#define ARGBTORGB565                                                        \
  "shll       v18.8h, v18.8b, #8             \n" /* R                    */ \
  "shll       v17.8h, v17.8b, #8             \n" /* G                    */ \
  "shll       v16.8h, v16.8b, #8             \n" /* B                    */ \
  "sri        v18.8h, v17.8h, #5             \n" /* RG                   */ \
  "sri        v18.8h, v16.8h, #11            \n" /* RGB                  */

I422ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)298 void I422ToRGB565Row_NEON(const uint8_t* src_y,
299                           const uint8_t* src_u,
300                           const uint8_t* src_v,
301                           uint8_t* dst_rgb565,
302                           const struct YuvConstants* yuvconstants,
303                           int width) {
304   asm volatile(
305       YUVTORGB_SETUP
306       "1:                                        \n" READYUV422 YUVTORGB
307       RGBTORGB8 "subs        %w[width], %w[width], #8      \n" ARGBTORGB565
308       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8 pixels RGB565.
309       "b.gt        1b                            \n"
310       : [src_y] "+r"(src_y),                               // %[src_y]
311         [src_u] "+r"(src_u),                               // %[src_u]
312         [src_v] "+r"(src_v),                               // %[src_v]
313         [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
314         [width] "+r"(width)                                // %[width]
315       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
316         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
317       : "cc", "memory", YUVTORGB_REGS);
318 }
319 
// Pack 8-bit B (v16), G (v17), R (v18) and A (v19) into 8 ARGB1555 halfwords
// in v0. Each channel is widened into the top byte of a 16-bit lane; the
// shift-right-and-insert steps then keep 1 bit of A, 5 of R, 5 of G and 5 of
// B, giving ARRRRRGG GGGBBBBB per lane.
// Overwrites v0, v16, v17 and v18; v19 is only read.
#define ARGBTOARGB1555                                                      \
  "shll       v0.8h,  v19.8b, #8             \n" /* A                    */ \
  "shll       v18.8h, v18.8b, #8             \n" /* R                    */ \
  "shll       v17.8h, v17.8b, #8             \n" /* G                    */ \
  "shll       v16.8h, v16.8b, #8             \n" /* B                    */ \
  "sri        v0.8h,  v18.8h, #1             \n" /* AR                   */ \
  "sri        v0.8h,  v17.8h, #6             \n" /* ARG                  */ \
  "sri        v0.8h,  v16.8h, #11            \n" /* ARGB                 */

I422ToARGB1555Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb1555,const struct YuvConstants * yuvconstants,int width)329 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
330                             const uint8_t* src_u,
331                             const uint8_t* src_v,
332                             uint8_t* dst_argb1555,
333                             const struct YuvConstants* yuvconstants,
334                             int width) {
335   asm volatile(
336       YUVTORGB_SETUP
337       "movi        v19.8b, #255                  \n"
338       "1:                                        \n" READYUV422 YUVTORGB
339           RGBTORGB8
340       "subs        %w[width], %w[width], #8      \n" ARGBTOARGB1555
341       "st1         {v0.8h}, [%[dst_argb1555]], #16 \n"  // store 8 pixels
342                                                         // RGB565.
343       "b.gt        1b                            \n"
344       : [src_y] "+r"(src_y),                               // %[src_y]
345         [src_u] "+r"(src_u),                               // %[src_u]
346         [src_v] "+r"(src_v),                               // %[src_v]
347         [dst_argb1555] "+r"(dst_argb1555),                 // %[dst_argb1555]
348         [width] "+r"(width)                                // %[width]
349       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
350         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
351       : "cc", "memory", YUVTORGB_REGS, "v19");
352 }
353 
// Pack 8-bit B, G, R and A into 8 ARGB4444 pixels (16 bytes) in v0.
// B and R are reduced to their high nibble by a right shift; G and A keep
// their high nibble in place by clearing the low nibble with the 0x0f mask in
// v23. The byte pairs (G|B) and (A|R) are then interleaved by zip1.
// Overwrites v0, v1 and all four inputs (v16..v19), so a caller that loops
// must reload v19 on every iteration.
#define ARGBTOARGB4444                                                       \
  /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f       */ \
  "ushr       v16.8b, v16.8b, #4             \n" /* B                    */  \
  "bic        v17.8b, v17.8b, v23.8b         \n" /* G                    */  \
  "ushr       v18.8b, v18.8b, #4             \n" /* R                    */  \
  "bic        v19.8b, v19.8b, v23.8b         \n" /* A                    */  \
  "orr        v0.8b,  v16.8b, v17.8b         \n" /* BG                   */  \
  "orr        v1.8b,  v18.8b, v19.8b         \n" /* RA                   */  \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */

I422ToARGB4444Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_argb4444,const struct YuvConstants * yuvconstants,int width)364 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
365                             const uint8_t* src_u,
366                             const uint8_t* src_v,
367                             uint8_t* dst_argb4444,
368                             const struct YuvConstants* yuvconstants,
369                             int width) {
370   asm volatile(
371       YUVTORGB_SETUP
372       "movi        v23.16b, #0x0f                \n"  // bits to clear with
373                                                       // vbic.
374       "1:                                        \n" READYUV422 YUVTORGB
375           RGBTORGB8
376       "subs        %w[width], %w[width], #8      \n"
377       "movi        v19.8b, #255                  \n" ARGBTOARGB4444
378       "st1         {v0.8h}, [%[dst_argb4444]], #16 \n"  // store 8
379                                                         // pixels
380                                                         // ARGB4444.
381       "b.gt        1b                            \n"
382       : [src_y] "+r"(src_y),                               // %[src_y]
383         [src_u] "+r"(src_u),                               // %[src_u]
384         [src_v] "+r"(src_v),                               // %[src_v]
385         [dst_argb4444] "+r"(dst_argb4444),                 // %[dst_argb4444]
386         [width] "+r"(width)                                // %[width]
387       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
388         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
389       : "cc", "memory", YUVTORGB_REGS, "v19", "v23");
390 }
391 
I400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)392 void I400ToARGBRow_NEON(const uint8_t* src_y,
393                         uint8_t* dst_argb,
394                         const struct YuvConstants* yuvconstants,
395                         int width) {
396   asm volatile(
397       YUVTORGB_SETUP
398       "movi        v19.8b, #255                  \n"
399       "1:                                        \n" READYUV400 YUVTORGB
400           RGBTORGB8
401       "subs        %w[width], %w[width], #8      \n"
402       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
403       "b.gt        1b                            \n"
404       : [src_y] "+r"(src_y),                               // %[src_y]
405         [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
406         [width] "+r"(width)                                // %[width]
407       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
408         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
409       : "cc", "memory", YUVTORGB_REGS, "v19");
410 }
411 
412 #if LIBYUV_USE_ST4
J400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,int width)413 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
414   asm volatile(
415       "movi        v23.8b, #255                  \n"
416       "1:                                        \n"
417       "ld1         {v20.8b}, [%0], #8            \n"
418       "prfm        pldl1keep, [%0, 448]          \n"
419       "orr         v21.8b, v20.8b, v20.8b        \n"
420       "orr         v22.8b, v20.8b, v20.8b        \n"
421       "subs        %w2, %w2, #8                  \n"
422       "st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
423       "b.gt        1b                            \n"
424       : "+r"(src_y),     // %0
425         "+r"(dst_argb),  // %1
426         "+r"(width)      // %2
427       :
428       : "cc", "memory", "v20", "v21", "v22", "v23");
429 }
430 #else
J400ToARGBRow_NEON(const uint8_t * src_y,uint8_t * dst_argb,int width)431 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
432   asm volatile(
433       "movi        v20.8b, #255                  \n"
434       "1:                                        \n"
435       "ldr         d16, [%0], #8                 \n"
436       "subs        %w2, %w2, #8                  \n"
437       "zip1        v18.16b, v16.16b, v16.16b     \n"  // YY
438       "zip1        v19.16b, v16.16b, v20.16b     \n"  // YA
439       "prfm        pldl1keep, [%0, 448]          \n"
440       "zip1        v16.16b, v18.16b, v19.16b     \n"  // YYYA
441       "zip2        v17.16b, v18.16b, v19.16b     \n"
442       "stp         q16, q17, [%1], #32           \n"
443       "b.gt        1b                            \n"
444       : "+r"(src_y),     // %0
445         "+r"(dst_argb),  // %1
446         "+r"(width)      // %2
447       :
448       : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
449 }
450 #endif  // LIBYUV_USE_ST4
451 
NV12ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)452 void NV12ToARGBRow_NEON(const uint8_t* src_y,
453                         const uint8_t* src_uv,
454                         uint8_t* dst_argb,
455                         const struct YuvConstants* yuvconstants,
456                         int width) {
457   asm volatile(
458       YUVTORGB_SETUP
459       "movi        v19.8b, #255                  \n"
460       "ldr         q2, [%[kNV12Table]]           \n"
461       "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
462       "subs        %w[width], %w[width], #8      \n"
463       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
464       "b.gt        1b                            \n"
465       : [src_y] "+r"(src_y),                                // %[src_y]
466         [src_uv] "+r"(src_uv),                              // %[src_uv]
467         [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
468         [width] "+r"(width)                                 // %[width]
469       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
470         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
471         [kNV12Table] "r"(&kNV12Table)
472       : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
473 }
474 
NV21ToARGBRow_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)475 void NV21ToARGBRow_NEON(const uint8_t* src_y,
476                         const uint8_t* src_vu,
477                         uint8_t* dst_argb,
478                         const struct YuvConstants* yuvconstants,
479                         int width) {
480   asm volatile(
481       YUVTORGB_SETUP
482       "movi        v19.8b, #255                  \n"
483       "ldr         q2, [%[kNV12Table]]           \n"
484       "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
485       "subs        %w[width], %w[width], #8      \n"
486       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
487       "b.gt        1b                            \n"
488       : [src_y] "+r"(src_y),                                // %[src_y]
489         [src_uv] "+r"(src_vu),                              // %[src_uv]
490         [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
491         [width] "+r"(width)                                 // %[width]
492       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
493         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
494         [kNV12Table] "r"(&kNV21Table)
495       : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
496 }
497 
NV12ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)498 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
499                          const uint8_t* src_uv,
500                          uint8_t* dst_rgb24,
501                          const struct YuvConstants* yuvconstants,
502                          int width) {
503   asm volatile(
504       YUVTORGB_SETUP
505       "ldr         q2, [%[kNV12Table]]           \n"
506       "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
507       "subs        %w[width], %w[width], #8      \n"
508       "st3         {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
509       "b.gt        1b                            \n"
510       : [src_y] "+r"(src_y),                                // %[src_y]
511         [src_uv] "+r"(src_uv),                              // %[src_uv]
512         [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
513         [width] "+r"(width)                                 // %[width]
514       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
515         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
516         [kNV12Table] "r"(&kNV12Table)
517       : "cc", "memory", YUVTORGB_REGS, "v2");
518 }
519 
NV21ToRGB24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_rgb24,const struct YuvConstants * yuvconstants,int width)520 void NV21ToRGB24Row_NEON(const uint8_t* src_y,
521                          const uint8_t* src_vu,
522                          uint8_t* dst_rgb24,
523                          const struct YuvConstants* yuvconstants,
524                          int width) {
525   asm volatile(
526       YUVTORGB_SETUP
527       "ldr         q2, [%[kNV12Table]]           \n"
528       "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
529       "subs        %w[width], %w[width], #8      \n"
530       "st3         {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
531       "b.gt        1b                            \n"
532       : [src_y] "+r"(src_y),                                // %[src_y]
533         [src_uv] "+r"(src_vu),                              // %[src_uv]
534         [dst_rgb24] "+r"(dst_rgb24),                        // %[dst_rgb24]
535         [width] "+r"(width)                                 // %[width]
536       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
537         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
538         [kNV12Table] "r"(&kNV21Table)
539       : "cc", "memory", YUVTORGB_REGS, "v2");
540 }
541 
NV12ToRGB565Row_NEON(const uint8_t * src_y,const uint8_t * src_uv,uint8_t * dst_rgb565,const struct YuvConstants * yuvconstants,int width)542 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
543                           const uint8_t* src_uv,
544                           uint8_t* dst_rgb565,
545                           const struct YuvConstants* yuvconstants,
546                           int width) {
547   asm volatile(
548       YUVTORGB_SETUP
549       "ldr         q2, [%[kNV12Table]]           \n"
550       "1:                                        \n" READNV12 YUVTORGB RGBTORGB8
551       "subs        %w[width], %w[width], #8      \n" ARGBTORGB565
552       "st1         {v18.8h}, [%[dst_rgb565]], #16 \n"  // store 8
553                                                        // pixels
554                                                        // RGB565.
555       "b.gt        1b                            \n"
556       : [src_y] "+r"(src_y),                                // %[src_y]
557         [src_uv] "+r"(src_uv),                              // %[src_uv]
558         [dst_rgb565] "+r"(dst_rgb565),                      // %[dst_rgb565]
559         [width] "+r"(width)                                 // %[width]
560       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
561         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
562         [kNV12Table] "r"(&kNV12Table)
563       : "cc", "memory", YUVTORGB_REGS, "v2");
564 }
565 
YUY2ToARGBRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)566 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
567                         uint8_t* dst_argb,
568                         const struct YuvConstants* yuvconstants,
569                         int width) {
570   asm volatile(
571       YUVTORGB_SETUP
572       "movi        v19.8b, #255                  \n"
573       "ldr         q2, [%[kNV12Table]]           \n"
574       "1:                                        \n" READYUY2 YUVTORGB RGBTORGB8
575       "subs        %w[width], %w[width], #8      \n"
576       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
577       "b.gt        1b                            \n"
578       : [src_yuy2] "+r"(src_yuy2),                          // %[src_yuy2]
579         [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
580         [width] "+r"(width)                                 // %[width]
581       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
582         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
583         [kNV12Table] "r"(&kNV12Table)
584       : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
585 }
586 
UYVYToARGBRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_argb,const struct YuvConstants * yuvconstants,int width)587 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
588                         uint8_t* dst_argb,
589                         const struct YuvConstants* yuvconstants,
590                         int width) {
591   asm volatile(
592       YUVTORGB_SETUP
593       "movi        v19.8b, #255                  \n"
594       "ldr         q2, [%[kNV12Table]]           \n"
595       "1:                                        \n" READUYVY YUVTORGB RGBTORGB8
596       "subs        %w[width], %w[width], #8      \n"
597       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
598       "b.gt        1b                            \n"
599       : [src_uyvy] "+r"(src_uyvy),                          // %[src_yuy2]
600         [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
601         [width] "+r"(width)                                 // %[width]
602       : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
603         [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
604         [kNV12Table] "r"(&kNV12Table)
605       : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
606 }
607 
608 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
SplitUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)609 void SplitUVRow_NEON(const uint8_t* src_uv,
610                      uint8_t* dst_u,
611                      uint8_t* dst_v,
612                      int width) {
613   asm volatile(
614       "1:                                        \n"
615       "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pairs of UV
616       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
617       "prfm        pldl1keep, [%0, 448]          \n"
618       "st1         {v0.16b}, [%1], #16           \n"  // store U
619       "st1         {v1.16b}, [%2], #16           \n"  // store V
620       "b.gt        1b                            \n"
621       : "+r"(src_uv),               // %0
622         "+r"(dst_u),                // %1
623         "+r"(dst_v),                // %2
624         "+r"(width)                 // %3  // Output registers
625       :                             // Input registers
626       : "cc", "memory", "v0", "v1"  // Clobber List
627   );
628 }
629 
630 // Reads 16 byte Y's from tile and writes out 16 Y's.
631 // MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
632 // MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
633 // width measured in bytes so 8 UV = 16.
DetileRow_NEON(const uint8_t * src,ptrdiff_t src_tile_stride,uint8_t * dst,int width)634 void DetileRow_NEON(const uint8_t* src,
635                     ptrdiff_t src_tile_stride,
636                     uint8_t* dst,
637                     int width) {
638   asm volatile(
639       "1:                                        \n"
640       "ld1         {v0.16b}, [%0], %3            \n"  // load 16 bytes
641       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
642       "prfm        pldl1keep, [%0, 1792]         \n"  // 7 tiles of 256b ahead
643       "st1         {v0.16b}, [%1], #16           \n"  // store 16 bytes
644       "b.gt        1b                            \n"
645       : "+r"(src),            // %0
646         "+r"(dst),            // %1
647         "+r"(width)           // %2
648       : "r"(src_tile_stride)  // %3
649       : "cc", "memory", "v0"  // Clobber List
650   );
651 }
652 
653 // Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's.
DetileRow_16_NEON(const uint16_t * src,ptrdiff_t src_tile_stride,uint16_t * dst,int width)654 void DetileRow_16_NEON(const uint16_t* src,
655                        ptrdiff_t src_tile_stride,
656                        uint16_t* dst,
657                        int width) {
658   asm volatile(
659       "1:                                        \n"
660       "ld1         {v0.8h,v1.8h}, [%0], %3       \n"  // load 16 pixels
661       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
662       "prfm        pldl1keep, [%0, 3584]         \n"  // 7 tiles of 512b ahead
663       "st1         {v0.8h,v1.8h}, [%1], #32      \n"  // store 16 pixels
664       "b.gt        1b                            \n"
665       : "+r"(src),                  // %0
666         "+r"(dst),                  // %1
667         "+r"(width)                 // %2
668       : "r"(src_tile_stride * 2)    // %3
669       : "cc", "memory", "v0", "v1"  // Clobber List
670   );
671 }
672 
673 // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
DetileSplitUVRow_NEON(const uint8_t * src_uv,ptrdiff_t src_tile_stride,uint8_t * dst_u,uint8_t * dst_v,int width)674 void DetileSplitUVRow_NEON(const uint8_t* src_uv,
675                            ptrdiff_t src_tile_stride,
676                            uint8_t* dst_u,
677                            uint8_t* dst_v,
678                            int width) {
679   asm volatile(
680       "1:                                        \n"
681       "ld2         {v0.8b,v1.8b}, [%0], %4       \n"
682       "subs        %w3, %w3, #16                 \n"
683       "prfm        pldl1keep, [%0, 1792]         \n"
684       "st1         {v0.8b}, [%1], #8             \n"
685       "st1         {v1.8b}, [%2], #8             \n"
686       "b.gt        1b                            \n"
687       : "+r"(src_uv),               // %0
688         "+r"(dst_u),                // %1
689         "+r"(dst_v),                // %2
690         "+r"(width)                 // %3
691       : "r"(src_tile_stride)        // %4
692       : "cc", "memory", "v0", "v1"  // Clobber List
693   );
694 }
695 
696 #if LIBYUV_USE_ST2
697 // Read 16 Y, 8 UV, and write 8 YUY2
DetileToYUY2_NEON(const uint8_t * src_y,ptrdiff_t src_y_tile_stride,const uint8_t * src_uv,ptrdiff_t src_uv_tile_stride,uint8_t * dst_yuy2,int width)698 void DetileToYUY2_NEON(const uint8_t* src_y,
699                        ptrdiff_t src_y_tile_stride,
700                        const uint8_t* src_uv,
701                        ptrdiff_t src_uv_tile_stride,
702                        uint8_t* dst_yuy2,
703                        int width) {
704   asm volatile(
705       "1:                                        \n"
706       "ld1         {v0.16b}, [%0], %4            \n"  // load 16 Ys
707       "prfm        pldl1keep, [%0, 1792]         \n"
708       "ld1         {v1.16b}, [%1], %5            \n"  // load 8 UVs
709       "prfm        pldl1keep, [%1, 1792]         \n"
710       "subs        %w3, %w3, #16                 \n"  // store 8 YUY2
711       "st2         {v0.16b,v1.16b}, [%2], #32    \n"
712       "b.gt        1b                            \n"
713       : "+r"(src_y),                // %0
714         "+r"(src_uv),               // %1
715         "+r"(dst_yuy2),             // %2
716         "+r"(width)                 // %3
717       : "r"(src_y_tile_stride),     // %4
718         "r"(src_uv_tile_stride)     // %5
719       : "cc", "memory", "v0", "v1"  // Clobber list
720   );
721 }
722 #else
723 // Read 16 Y, 8 UV, and write 8 YUY2
DetileToYUY2_NEON(const uint8_t * src_y,ptrdiff_t src_y_tile_stride,const uint8_t * src_uv,ptrdiff_t src_uv_tile_stride,uint8_t * dst_yuy2,int width)724 void DetileToYUY2_NEON(const uint8_t* src_y,
725                        ptrdiff_t src_y_tile_stride,
726                        const uint8_t* src_uv,
727                        ptrdiff_t src_uv_tile_stride,
728                        uint8_t* dst_yuy2,
729                        int width) {
730   asm volatile(
731       "1:                                        \n"
732       "ld1         {v0.16b}, [%0], %4            \n"  // load 16 Ys
733       "ld1         {v1.16b}, [%1], %5            \n"  // load 8 UVs
734       "subs        %w3, %w3, #16                 \n"
735       "prfm        pldl1keep, [%0, 1792]         \n"
736       "zip1        v2.16b, v0.16b, v1.16b        \n"
737       "prfm        pldl1keep, [%1, 1792]         \n"
738       "zip2        v3.16b, v0.16b, v1.16b        \n"
739       "st1         {v2.16b,v3.16b}, [%2], #32    \n"  // store 8 YUY2
740       "b.gt        1b                            \n"
741       : "+r"(src_y),                            // %0
742         "+r"(src_uv),                           // %1
743         "+r"(dst_yuy2),                         // %2
744         "+r"(width)                             // %3
745       : "r"(src_y_tile_stride),                 // %4
746         "r"(src_uv_tile_stride)                 // %5
747       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber list
748   );
749 }
750 #endif
751 
752 // Unpack MT2T into tiled P010 64 pixels at a time. See
753 // tinyurl.com/mtk-10bit-video-format for format documentation.
UnpackMT2T_NEON(const uint8_t * src,uint16_t * dst,size_t size)754 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
755   asm volatile(
756       "1:                                        \n"
757       "ld1         {v7.16b}, [%0], #16           \n"
758       "ld1         {v0.16b-v3.16b}, [%0], #64    \n"
759       "shl         v4.16b, v7.16b, #6            \n"
760       "shl         v5.16b, v7.16b, #4            \n"
761       "shl         v6.16b, v7.16b, #2            \n"
762       "subs        %2, %2, #80                   \n"
763       "zip1        v16.16b, v4.16b, v0.16b       \n"
764       "zip1        v18.16b, v5.16b, v1.16b       \n"
765       "zip1        v20.16b, v6.16b, v2.16b       \n"
766       "zip1        v22.16b, v7.16b, v3.16b       \n"
767       "zip2        v17.16b, v4.16b, v0.16b       \n"
768       "zip2        v19.16b, v5.16b, v1.16b       \n"
769       "zip2        v21.16b, v6.16b, v2.16b       \n"
770       "zip2        v23.16b, v7.16b, v3.16b       \n"
771       "sri         v16.8h, v16.8h, #10           \n"
772       "sri         v17.8h, v17.8h, #10           \n"
773       "sri         v18.8h, v18.8h, #10           \n"
774       "sri         v19.8h, v19.8h, #10           \n"
775       "st1         {v16.8h-v19.8h}, [%1], #64    \n"
776       "sri         v20.8h, v20.8h, #10           \n"
777       "sri         v21.8h, v21.8h, #10           \n"
778       "sri         v22.8h, v22.8h, #10           \n"
779       "sri         v23.8h, v23.8h, #10           \n"
780       "st1         {v20.8h-v23.8h}, [%1], #64    \n"
781       "b.gt        1b                            \n"
782       : "+r"(src),  // %0
783         "+r"(dst),  // %1
784         "+r"(size)  // %2
785       :
786       : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
787         "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
788 }
789 
790 #if LIBYUV_USE_ST2
791 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uv,int width)792 void MergeUVRow_NEON(const uint8_t* src_u,
793                      const uint8_t* src_v,
794                      uint8_t* dst_uv,
795                      int width) {
796   asm volatile(
797       "1:                                        \n"
798       "ld1         {v0.16b}, [%0], #16           \n"  // load U
799       "ld1         {v1.16b}, [%1], #16           \n"  // load V
800       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
801       "prfm        pldl1keep, [%0, 448]          \n"
802       "prfm        pldl1keep, [%1, 448]          \n"
803       "st2         {v0.16b,v1.16b}, [%2], #32    \n"  // store 16 pairs of UV
804       "b.gt        1b                            \n"
805       : "+r"(src_u),                // %0
806         "+r"(src_v),                // %1
807         "+r"(dst_uv),               // %2
808         "+r"(width)                 // %3  // Output registers
809       :                             // Input registers
810       : "cc", "memory", "v0", "v1"  // Clobber List
811   );
812 }
813 
MergeUVRow_16_NEON(const uint16_t * src_u,const uint16_t * src_v,uint16_t * dst_uv,int depth,int width)814 void MergeUVRow_16_NEON(const uint16_t* src_u,
815                         const uint16_t* src_v,
816                         uint16_t* dst_uv,
817                         int depth,
818                         int width) {
819   int shift = 16 - depth;
820   asm volatile(
821       "dup         v2.8h, %w4                    \n"
822       "1:                                        \n"
823       "ld1         {v0.8h}, [%0], #16            \n"  // load 8 U
824       "subs        %w3, %w3, #8                  \n"  // 8 src pixels per loop
825       "ld1         {v1.8h}, [%1], #16            \n"  // load 8 V
826       "ushl        v0.8h, v0.8h, v2.8h           \n"
827       "prfm        pldl1keep, [%0, 448]          \n"
828       "ushl        v1.8h, v1.8h, v2.8h           \n"
829       "prfm        pldl1keep, [%1, 448]          \n"
830       "st2         {v0.8h, v1.8h}, [%2], #32     \n"  // store 8 UV pixels
831       "b.gt        1b                            \n"
832       : "+r"(src_u),   // %0
833         "+r"(src_v),   // %1
834         "+r"(dst_uv),  // %2
835         "+r"(width)    // %3
836       : "r"(shift)     // %4
837       : "cc", "memory", "v0", "v1", "v2");
838 }
839 #else
840 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uv,int width)841 void MergeUVRow_NEON(const uint8_t* src_u,
842                      const uint8_t* src_v,
843                      uint8_t* dst_uv,
844                      int width) {
845   asm volatile(
846       "1:                                        \n"
847       "ld1         {v0.16b}, [%0], #16           \n"  // load U
848       "ld1         {v1.16b}, [%1], #16           \n"  // load V
849       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop
850       "zip1        v2.16b, v0.16b, v1.16b        \n"
851       "prfm        pldl1keep, [%0, 448]          \n"
852       "zip2        v3.16b, v0.16b, v1.16b        \n"
853       "prfm        pldl1keep, [%1, 448]          \n"
854       "st1         {v2.16b,v3.16b}, [%2], #32    \n"  // store 16 pairs of UV
855       "b.gt        1b                            \n"
856       : "+r"(src_u),                            // %0
857         "+r"(src_v),                            // %1
858         "+r"(dst_uv),                           // %2
859         "+r"(width)                             // %3  // Output registers
860       :                                         // Input registers
861       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
862   );
863 }
864 
MergeUVRow_16_NEON(const uint16_t * src_u,const uint16_t * src_v,uint16_t * dst_uv,int depth,int width)865 void MergeUVRow_16_NEON(const uint16_t* src_u,
866                         const uint16_t* src_v,
867                         uint16_t* dst_uv,
868                         int depth,
869                         int width) {
870   int shift = 16 - depth;
871   asm volatile(
872       "dup         v4.8h, %w4                    \n"
873       "1:                                        \n"
874       "ld1         {v0.8h}, [%0], #16            \n"  // load 8 U
875       "subs        %w3, %w3, #8                  \n"  // 8 src pixels per loop
876       "ld1         {v1.8h}, [%1], #16            \n"  // load 8 V
877       "ushl        v0.8h, v0.8h, v4.8h           \n"
878       "ushl        v1.8h, v1.8h, v4.8h           \n"
879       "prfm        pldl1keep, [%0, 448]          \n"
880       "zip1        v2.8h, v0.8h, v1.8h           \n"
881       "zip2        v3.8h, v0.8h, v1.8h           \n"
882       "prfm        pldl1keep, [%1, 448]          \n"
883       "st1         {v2.8h, v3.8h}, [%2], #32     \n"  // store 8 UV pixels
884       "b.gt        1b                            \n"
885       : "+r"(src_u),   // %0
886         "+r"(src_v),   // %1
887         "+r"(dst_uv),  // %2
888         "+r"(width)    // %3
889       : "r"(shift)     // %4
890       : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
891 }
892 #endif  // LIBYUV_USE_ST2
893 
894 // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
SplitRGBRow_NEON(const uint8_t * src_rgb,uint8_t * dst_r,uint8_t * dst_g,uint8_t * dst_b,int width)895 void SplitRGBRow_NEON(const uint8_t* src_rgb,
896                       uint8_t* dst_r,
897                       uint8_t* dst_g,
898                       uint8_t* dst_b,
899                       int width) {
900   asm volatile(
901       "1:                                        \n"
902       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
903       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
904       "prfm        pldl1keep, [%0, 448]          \n"
905       "st1         {v0.16b}, [%1], #16           \n"  // store R
906       "st1         {v1.16b}, [%2], #16           \n"  // store G
907       "st1         {v2.16b}, [%3], #16           \n"  // store B
908       "b.gt        1b                            \n"
909       : "+r"(src_rgb),                    // %0
910         "+r"(dst_r),                      // %1
911         "+r"(dst_g),                      // %2
912         "+r"(dst_b),                      // %3
913         "+r"(width)                       // %4
914       :                                   // Input registers
915       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
916   );
917 }
918 
919 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
MergeRGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,uint8_t * dst_rgb,int width)920 void MergeRGBRow_NEON(const uint8_t* src_r,
921                       const uint8_t* src_g,
922                       const uint8_t* src_b,
923                       uint8_t* dst_rgb,
924                       int width) {
925   asm volatile(
926       "1:                                        \n"
927       "ld1         {v0.16b}, [%0], #16           \n"  // load R
928       "ld1         {v1.16b}, [%1], #16           \n"  // load G
929       "ld1         {v2.16b}, [%2], #16           \n"  // load B
930       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
931       "prfm        pldl1keep, [%0, 448]          \n"
932       "prfm        pldl1keep, [%1, 448]          \n"
933       "prfm        pldl1keep, [%2, 448]          \n"
934       "st3         {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
935       "b.gt        1b                            \n"
936       : "+r"(src_r),                      // %0
937         "+r"(src_g),                      // %1
938         "+r"(src_b),                      // %2
939         "+r"(dst_rgb),                    // %3
940         "+r"(width)                       // %4
941       :                                   // Input registers
942       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
943   );
944 }
945 
946 // Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
SplitARGBRow_NEON(const uint8_t * src_rgba,uint8_t * dst_r,uint8_t * dst_g,uint8_t * dst_b,uint8_t * dst_a,int width)947 void SplitARGBRow_NEON(const uint8_t* src_rgba,
948                        uint8_t* dst_r,
949                        uint8_t* dst_g,
950                        uint8_t* dst_b,
951                        uint8_t* dst_a,
952                        int width) {
953   asm volatile(
954       "1:                                        \n"
955       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
956       "subs        %w5, %w5, #16                 \n"  // 16 processed per loop
957       "prfm        pldl1keep, [%0, 448]          \n"
958       "st1         {v0.16b}, [%3], #16           \n"  // store B
959       "st1         {v1.16b}, [%2], #16           \n"  // store G
960       "st1         {v2.16b}, [%1], #16           \n"  // store R
961       "st1         {v3.16b}, [%4], #16           \n"  // store A
962       "b.gt        1b                            \n"
963       : "+r"(src_rgba),                         // %0
964         "+r"(dst_r),                            // %1
965         "+r"(dst_g),                            // %2
966         "+r"(dst_b),                            // %3
967         "+r"(dst_a),                            // %4
968         "+r"(width)                             // %5
969       :                                         // Input registers
970       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
971   );
972 }
973 
974 #if LIBYUV_USE_ST4
975 // Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
MergeARGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,const uint8_t * src_a,uint8_t * dst_argb,int width)976 void MergeARGBRow_NEON(const uint8_t* src_r,
977                        const uint8_t* src_g,
978                        const uint8_t* src_b,
979                        const uint8_t* src_a,
980                        uint8_t* dst_argb,
981                        int width) {
982   asm volatile(
983       "1:                                        \n"
984       "ld1         {v0.16b}, [%2], #16           \n"  // load B
985       "ld1         {v1.16b}, [%1], #16           \n"  // load G
986       "ld1         {v2.16b}, [%0], #16           \n"  // load R
987       "ld1         {v3.16b}, [%3], #16           \n"  // load A
988       "subs        %w5, %w5, #16                 \n"  // 16 processed per loop
989       "prfm        pldl1keep, [%0, 448]          \n"
990       "prfm        pldl1keep, [%1, 448]          \n"
991       "prfm        pldl1keep, [%2, 448]          \n"
992       "prfm        pldl1keep, [%3, 448]          \n"
993       "st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n"  // store 16ARGB
994       "b.gt        1b                            \n"
995       : "+r"(src_r),                            // %0
996         "+r"(src_g),                            // %1
997         "+r"(src_b),                            // %2
998         "+r"(src_a),                            // %3
999         "+r"(dst_argb),                         // %4
1000         "+r"(width)                             // %5
1001       :                                         // Input registers
1002       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1003   );
1004 }
1005 #else
1006 // Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
MergeARGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,const uint8_t * src_a,uint8_t * dst_argb,int width)1007 void MergeARGBRow_NEON(const uint8_t* src_r,
1008                        const uint8_t* src_g,
1009                        const uint8_t* src_b,
1010                        const uint8_t* src_a,
1011                        uint8_t* dst_argb,
1012                        int width) {
1013   asm volatile(
1014       "1:                                        \n"
1015       "ld1         {v0.16b}, [%2], #16           \n"  // load B
1016       "ld1         {v1.16b}, [%1], #16           \n"  // load G
1017       "ld1         {v2.16b}, [%0], #16           \n"  // load R
1018       "ld1         {v3.16b}, [%3], #16           \n"  // load A
1019       "subs        %w5, %w5, #16                 \n"  // 16 processed per loop
1020       "prfm        pldl1keep, [%2, 448]          \n"
1021       "zip1        v4.16b, v0.16b, v1.16b        \n"  // BG
1022       "zip1        v5.16b, v2.16b, v3.16b        \n"  // RA
1023       "prfm        pldl1keep, [%1, 448]          \n"
1024       "zip2        v6.16b, v0.16b, v1.16b        \n"  // BG
1025       "zip2        v7.16b, v2.16b, v3.16b        \n"  // RA
1026       "prfm        pldl1keep, [%0, 448]          \n"
1027       "zip1        v0.8h, v4.8h, v5.8h           \n"  // BGRA
1028       "zip2        v1.8h, v4.8h, v5.8h           \n"
1029       "prfm        pldl1keep, [%3, 448]          \n"
1030       "zip1        v2.8h, v6.8h, v7.8h           \n"
1031       "zip2        v3.8h, v6.8h, v7.8h           \n"
1032       "st1         {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n"  // store 16ARGB
1033       "b.gt        1b                            \n"
1034       : "+r"(src_r),     // %0
1035         "+r"(src_g),     // %1
1036         "+r"(src_b),     // %2
1037         "+r"(src_a),     // %3
1038         "+r"(dst_argb),  // %4
1039         "+r"(width)      // %5
1040       :                  // Input registers
1041       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1042         "v7"  // Clobber List
1043   );
1044 }
1045 #endif  // LIBYUV_USE_ST4
1046 
1047 // Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
SplitXRGBRow_NEON(const uint8_t * src_rgba,uint8_t * dst_r,uint8_t * dst_g,uint8_t * dst_b,int width)1048 void SplitXRGBRow_NEON(const uint8_t* src_rgba,
1049                        uint8_t* dst_r,
1050                        uint8_t* dst_g,
1051                        uint8_t* dst_b,
1052                        int width) {
1053   asm volatile(
1054       "1:                                        \n"
1055       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
1056       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
1057       "prfm        pldl1keep, [%0, 448]          \n"
1058       "st1         {v0.16b}, [%3], #16           \n"  // store B
1059       "st1         {v1.16b}, [%2], #16           \n"  // store G
1060       "st1         {v2.16b}, [%1], #16           \n"  // store R
1061       "b.gt        1b                            \n"
1062       : "+r"(src_rgba),                         // %0
1063         "+r"(dst_r),                            // %1
1064         "+r"(dst_g),                            // %2
1065         "+r"(dst_b),                            // %3
1066         "+r"(width)                             // %4
1067       :                                         // Input registers
1068       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1069   );
1070 }
1071 
1072 // Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
MergeXRGBRow_NEON(const uint8_t * src_r,const uint8_t * src_g,const uint8_t * src_b,uint8_t * dst_argb,int width)1073 void MergeXRGBRow_NEON(const uint8_t* src_r,
1074                        const uint8_t* src_g,
1075                        const uint8_t* src_b,
1076                        uint8_t* dst_argb,
1077                        int width) {
1078   asm volatile(
1079       "movi        v3.16b, #255                  \n"  // load A(255)
1080       "1:                                        \n"
1081       "ld1         {v2.16b}, [%0], #16           \n"  // load R
1082       "ld1         {v1.16b}, [%1], #16           \n"  // load G
1083       "ld1         {v0.16b}, [%2], #16           \n"  // load B
1084       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop
1085       "prfm        pldl1keep, [%0, 448]          \n"
1086       "prfm        pldl1keep, [%1, 448]          \n"
1087       "prfm        pldl1keep, [%2, 448]          \n"
1088       "st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n"  // store 16ARGB
1089       "b.gt        1b                            \n"
1090       : "+r"(src_r),                            // %0
1091         "+r"(src_g),                            // %1
1092         "+r"(src_b),                            // %2
1093         "+r"(dst_argb),                         // %3
1094         "+r"(width)                             // %4
1095       :                                         // Input registers
1096       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1097   );
1098 }
1099 
MergeXR30Row_NEON(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,uint8_t * dst_ar30,int depth,int width)1100 void MergeXR30Row_NEON(const uint16_t* src_r,
1101                        const uint16_t* src_g,
1102                        const uint16_t* src_b,
1103                        uint8_t* dst_ar30,
1104                        int depth,
1105                        int width) {
1106   int shift = 10 - depth;
1107   asm volatile(
1108       "movi        v30.16b, #255                 \n"
1109       "ushr        v30.4s, v30.4s, #22           \n"  // 1023
1110       "dup         v31.4s, %w5                   \n"
1111       "1:                                        \n"
1112       "ldr         d2, [%2], #8                  \n"  // B
1113       "ldr         d1, [%1], #8                  \n"  // G
1114       "ldr         d0, [%0], #8                  \n"  // R
1115       "ushll       v2.4s, v2.4h, #0              \n"  // B
1116       "ushll       v1.4s, v1.4h, #0              \n"  // G
1117       "ushll       v0.4s, v0.4h, #0              \n"  // R
1118       "ushl        v2.4s, v2.4s, v31.4s          \n"  // 000B
1119       "ushl        v1.4s, v1.4s, v31.4s          \n"  // G
1120       "ushl        v0.4s, v0.4s, v31.4s          \n"  // R
1121       "umin        v2.4s, v2.4s, v30.4s          \n"
1122       "umin        v1.4s, v1.4s, v30.4s          \n"
1123       "umin        v0.4s, v0.4s, v30.4s          \n"
1124       "sli         v2.4s, v1.4s, #10             \n"  // 00GB
1125       "sli         v2.4s, v0.4s, #20             \n"  // 0RGB
1126       "orr         v2.4s, #0xc0, lsl #24         \n"  // ARGB (AR30)
1127       "subs        %w4, %w4, #4                  \n"
1128       "str         q2, [%3], #16                 \n"
1129       "b.gt        1b                            \n"
1130       : "+r"(src_r),     // %0
1131         "+r"(src_g),     // %1
1132         "+r"(src_b),     // %2
1133         "+r"(dst_ar30),  // %3
1134         "+r"(width)      // %4
1135       : "r"(shift)       // %5
1136       : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
1137 }
1138 
MergeXR30Row_10_NEON(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,uint8_t * dst_ar30,int,int width)1139 void MergeXR30Row_10_NEON(const uint16_t* src_r,
1140                           const uint16_t* src_g,
1141                           const uint16_t* src_b,
1142                           uint8_t* dst_ar30,
1143                           int /* depth */,
1144                           int width) {
1145   asm volatile(
1146       "movi        v30.16b, #255                 \n"
1147       "ushr        v30.4s, v30.4s, #22           \n"  // 1023
1148       "1:                                        \n"
1149       "ldr         d2, [%2], #8                  \n"  // B
1150       "ldr         d1, [%1], #8                  \n"  // G
1151       "ldr         d0, [%0], #8                  \n"  // R
1152       "ushll       v2.4s, v2.4h, #0              \n"  // 000B
1153       "ushll       v1.4s, v1.4h, #0              \n"  // G
1154       "ushll       v0.4s, v0.4h, #0              \n"  // R
1155       "umin        v2.4s, v2.4s, v30.4s          \n"
1156       "umin        v1.4s, v1.4s, v30.4s          \n"
1157       "umin        v0.4s, v0.4s, v30.4s          \n"
1158       "sli         v2.4s, v1.4s, #10             \n"  // 00GB
1159       "sli         v2.4s, v0.4s, #20             \n"  // 0RGB
1160       "orr         v2.4s, #0xc0, lsl #24         \n"  // ARGB (AR30)
1161       "subs        %w4, %w4, #4                  \n"
1162       "str         q2, [%3], #16                 \n"
1163       "b.gt        1b                            \n"
1164       : "+r"(src_r),     // %0
1165         "+r"(src_g),     // %1
1166         "+r"(src_b),     // %2
1167         "+r"(dst_ar30),  // %3
1168         "+r"(width)      // %4
1169       :
1170       : "memory", "cc", "v0", "v1", "v2", "v30");
1171 }
1172 
MergeAR64Row_NEON(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,const uint16_t * src_a,uint16_t * dst_ar64,int depth,int width)1173 void MergeAR64Row_NEON(const uint16_t* src_r,
1174                        const uint16_t* src_g,
1175                        const uint16_t* src_b,
1176                        const uint16_t* src_a,
1177                        uint16_t* dst_ar64,
1178                        int depth,
1179                        int width) {
1180   int shift = 16 - depth;
1181   int mask = (1 << depth) - 1;
1182   asm volatile(
1183 
1184       "dup         v30.8h, %w7                   \n"
1185       "dup         v31.8h, %w6                   \n"
1186       "1:                                        \n"
1187       "ldr         q2, [%0], #16                 \n"  // R
1188       "ldr         q1, [%1], #16                 \n"  // G
1189       "ldr         q0, [%2], #16                 \n"  // B
1190       "ldr         q3, [%3], #16                 \n"  // A
1191       "umin        v2.8h, v2.8h, v30.8h          \n"
1192       "prfm        pldl1keep, [%0, 448]          \n"
1193       "umin        v1.8h, v1.8h, v30.8h          \n"
1194       "prfm        pldl1keep, [%1, 448]          \n"
1195       "umin        v0.8h, v0.8h, v30.8h          \n"
1196       "prfm        pldl1keep, [%2, 448]          \n"
1197       "umin        v3.8h, v3.8h, v30.8h          \n"
1198       "prfm        pldl1keep, [%3, 448]          \n"
1199       "ushl        v2.8h, v2.8h, v31.8h          \n"
1200       "ushl        v1.8h, v1.8h, v31.8h          \n"
1201       "ushl        v0.8h, v0.8h, v31.8h          \n"
1202       "ushl        v3.8h, v3.8h, v31.8h          \n"
1203       "subs        %w5, %w5, #8                  \n"
1204       "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
1205       "b.gt        1b                            \n"
1206       : "+r"(src_r),     // %0
1207         "+r"(src_g),     // %1
1208         "+r"(src_b),     // %2
1209         "+r"(src_a),     // %3
1210         "+r"(dst_ar64),  // %4
1211         "+r"(width)      // %5
1212       : "r"(shift),      // %6
1213         "r"(mask)        // %7
1214       : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1215 }
1216 
MergeXR64Row_NEON(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,uint16_t * dst_ar64,int depth,int width)1217 void MergeXR64Row_NEON(const uint16_t* src_r,
1218                        const uint16_t* src_g,
1219                        const uint16_t* src_b,
1220                        uint16_t* dst_ar64,
1221                        int depth,
1222                        int width) {
1223   int shift = 16 - depth;
1224   int mask = (1 << depth) - 1;
1225   asm volatile(
1226 
1227       "movi        v3.16b, #0xff                 \n"  // A (0xffff)
1228       "dup         v30.8h, %w6                   \n"
1229       "dup         v31.8h, %w5                   \n"
1230 
1231       "1:                                        \n"
1232       "ldr         q2, [%0], #16                 \n"  // R
1233       "ldr         q1, [%1], #16                 \n"  // G
1234       "ldr         q0, [%2], #16                 \n"  // B
1235       "umin        v2.8h, v2.8h, v30.8h          \n"
1236       "prfm        pldl1keep, [%0, 448]          \n"
1237       "umin        v1.8h, v1.8h, v30.8h          \n"
1238       "prfm        pldl1keep, [%1, 448]          \n"
1239       "umin        v0.8h, v0.8h, v30.8h          \n"
1240       "prfm        pldl1keep, [%2, 448]          \n"
1241       "ushl        v2.8h, v2.8h, v31.8h          \n"
1242       "ushl        v1.8h, v1.8h, v31.8h          \n"
1243       "ushl        v0.8h, v0.8h, v31.8h          \n"
1244       "subs        %w4, %w4, #8                  \n"
1245       "st4         {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
1246       "b.gt        1b                            \n"
1247       : "+r"(src_r),     // %0
1248         "+r"(src_g),     // %1
1249         "+r"(src_b),     // %2
1250         "+r"(dst_ar64),  // %3
1251         "+r"(width)      // %4
1252       : "r"(shift),      // %5
1253         "r"(mask)        // %6
1254       : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1255 }
1256 
MergeARGB16To8Row_NEON(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,const uint16_t * src_a,uint8_t * dst_argb,int depth,int width)1257 void MergeARGB16To8Row_NEON(const uint16_t* src_r,
1258                             const uint16_t* src_g,
1259                             const uint16_t* src_b,
1260                             const uint16_t* src_a,
1261                             uint8_t* dst_argb,
1262                             int depth,
1263                             int width) {
1264   int shift = 8 - depth;
1265   asm volatile(
1266 
1267       "dup         v31.8h, %w6                   \n"
1268       "1:                                        \n"
1269       "ldr         q2, [%0], #16                 \n"  // R
1270       "ldr         q1, [%1], #16                 \n"  // G
1271       "ldr         q0, [%2], #16                 \n"  // B
1272       "ldr         q3, [%3], #16                 \n"  // A
1273       "ushl        v2.8h, v2.8h, v31.8h          \n"
1274       "prfm        pldl1keep, [%0, 448]          \n"
1275       "ushl        v1.8h, v1.8h, v31.8h          \n"
1276       "prfm        pldl1keep, [%1, 448]          \n"
1277       "ushl        v0.8h, v0.8h, v31.8h          \n"
1278       "prfm        pldl1keep, [%2, 448]          \n"
1279       "ushl        v3.8h, v3.8h, v31.8h          \n"
1280       "prfm        pldl1keep, [%3, 448]          \n"
1281       "uqxtn       v2.8b, v2.8h                  \n"
1282       "uqxtn       v1.8b, v1.8h                  \n"
1283       "uqxtn       v0.8b, v0.8h                  \n"
1284       "uqxtn       v3.8b, v3.8h                  \n"
1285       "subs        %w5, %w5, #8                  \n"
1286       "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
1287       "b.gt        1b                            \n"
1288       : "+r"(src_r),     // %0
1289         "+r"(src_g),     // %1
1290         "+r"(src_b),     // %2
1291         "+r"(src_a),     // %3
1292         "+r"(dst_argb),  // %4
1293         "+r"(width)      // %5
1294       : "r"(shift)       // %6
1295       : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1296 }
1297 
MergeXRGB16To8Row_NEON(const uint16_t * src_r,const uint16_t * src_g,const uint16_t * src_b,uint8_t * dst_argb,int depth,int width)1298 void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
1299                             const uint16_t* src_g,
1300                             const uint16_t* src_b,
1301                             uint8_t* dst_argb,
1302                             int depth,
1303                             int width) {
1304   int shift = 8 - depth;
1305   asm volatile(
1306 
1307       "dup         v31.8h, %w5                   \n"
1308       "movi        v3.8b, #0xff                  \n"  // A (0xff)
1309       "1:                                        \n"
1310       "ldr         q2, [%0], #16                 \n"  // R
1311       "ldr         q1, [%1], #16                 \n"  // G
1312       "ldr         q0, [%2], #16                 \n"  // B
1313       "ushl        v2.8h, v2.8h, v31.8h          \n"
1314       "prfm        pldl1keep, [%0, 448]          \n"
1315       "ushl        v1.8h, v1.8h, v31.8h          \n"
1316       "prfm        pldl1keep, [%1, 448]          \n"
1317       "ushl        v0.8h, v0.8h, v31.8h          \n"
1318       "prfm        pldl1keep, [%2, 448]          \n"
1319       "uqxtn       v2.8b, v2.8h                  \n"
1320       "uqxtn       v1.8b, v1.8h                  \n"
1321       "uqxtn       v0.8b, v0.8h                  \n"
1322       "subs        %w4, %w4, #8                  \n"
1323       "st4         {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
1324       "b.gt        1b                            \n"
1325       : "+r"(src_r),     // %0
1326         "+r"(src_g),     // %1
1327         "+r"(src_b),     // %2
1328         "+r"(dst_argb),  // %3
1329         "+r"(width)      // %4
1330       : "r"(shift)       // %5
1331       : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1332 }
1333 
1334 // Copy multiple of 32.
CopyRow_NEON(const uint8_t * src,uint8_t * dst,int width)1335 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
1336   asm volatile(
1337       "1:                                        \n"
1338       "ldp         q0, q1, [%0], #32             \n"
1339       "prfm        pldl1keep, [%0, 448]          \n"
1340       "subs        %w2, %w2, #32                 \n"  // 32 processed per loop
1341       "stp         q0, q1, [%1], #32             \n"
1342       "b.gt        1b                            \n"
1343       : "+r"(src),                  // %0
1344         "+r"(dst),                  // %1
1345         "+r"(width)                 // %2  // Output registers
1346       :                             // Input registers
1347       : "cc", "memory", "v0", "v1"  // Clobber List
1348   );
1349 }
1350 
1351 // SetRow writes 'width' bytes using an 8 bit value repeated.
SetRow_NEON(uint8_t * dst,uint8_t v8,int width)1352 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
1353   asm volatile(
1354       "dup         v0.16b, %w2                   \n"  // duplicate 16 bytes
1355       "1:                                        \n"
1356       "subs        %w1, %w1, #16                 \n"  // 16 bytes per loop
1357       "st1         {v0.16b}, [%0], #16           \n"  // store
1358       "b.gt        1b                            \n"
1359       : "+r"(dst),   // %0
1360         "+r"(width)  // %1
1361       : "r"(v8)      // %2
1362       : "cc", "memory", "v0");
1363 }
1364 
ARGBSetRow_NEON(uint8_t * dst,uint32_t v32,int width)1365 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
1366   asm volatile(
1367       "dup         v0.4s, %w2                    \n"  // duplicate 4 ints
1368       "1:                                        \n"
1369       "subs        %w1, %w1, #4                  \n"  // 4 ints per loop
1370       "st1         {v0.16b}, [%0], #16           \n"  // store
1371       "b.gt        1b                            \n"
1372       : "+r"(dst),   // %0
1373         "+r"(width)  // %1
1374       : "r"(v32)     // %2
1375       : "cc", "memory", "v0");
1376 }
1377 
1378 // Shuffle table for reversing the bytes.
1379 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
1380                                      7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
1381 
MirrorRow_NEON(const uint8_t * src,uint8_t * dst,int width)1382 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
1383   asm volatile(
1384       // Start at end of source row.
1385       "ld1         {v3.16b}, [%3]                \n"  // shuffler
1386       "add         %0, %0, %w2, sxtw             \n"
1387       "sub         %0, %0, #32                   \n"
1388       "1:                                        \n"
1389       "ldr         q2, [%0, 16]                  \n"
1390       "ldr         q1, [%0], -32                 \n"  // src -= 32
1391       "subs        %w2, %w2, #32                 \n"  // 32 pixels per loop.
1392       "tbl         v0.16b, {v2.16b}, v3.16b      \n"
1393       "tbl         v1.16b, {v1.16b}, v3.16b      \n"
1394       "st1         {v0.16b, v1.16b}, [%1], #32   \n"  // store 32 pixels
1395       "b.gt        1b                            \n"
1396       : "+r"(src),            // %0
1397         "+r"(dst),            // %1
1398         "+r"(width)           // %2
1399       : "r"(&kShuffleMirror)  // %3
1400       : "cc", "memory", "v0", "v1", "v2", "v3");
1401 }
1402 
1403 // Shuffle table for reversing the UV.
1404 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
1405                                        6u,  7u,  4u,  5u,  2u,  3u,  0u, 1u};
1406 
MirrorUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_uv,int width)1407 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
1408   asm volatile(
1409       // Start at end of source row.
1410       "ld1         {v4.16b}, [%3]                \n"  // shuffler
1411       "add         %0, %0, %w2, sxtw #1          \n"
1412       "sub         %0, %0, #32                   \n"
1413       "1:                                        \n"
1414       "ldr         q1, [%0, 16]                  \n"
1415       "ldr         q0, [%0], -32                 \n"  // src -= 32
1416       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
1417       "tbl         v2.16b, {v1.16b}, v4.16b      \n"
1418       "tbl         v3.16b, {v0.16b}, v4.16b      \n"
1419       "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // dst += 32
1420       "b.gt        1b                            \n"
1421       : "+r"(src_uv),           // %0
1422         "+r"(dst_uv),           // %1
1423         "+r"(width)             // %2
1424       : "r"(&kShuffleMirrorUV)  // %3
1425       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
1426 }
1427 
MirrorSplitUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_u,uint8_t * dst_v,int width)1428 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
1429                            uint8_t* dst_u,
1430                            uint8_t* dst_v,
1431                            int width) {
1432   asm volatile(
1433       // Start at end of source row.
1434       "ld1         {v4.16b}, [%4]                \n"  // shuffler
1435       "add         %0, %0, %w3, sxtw #1          \n"
1436       "sub         %0, %0, #32                   \n"
1437       "1:                                        \n"
1438       "ldr         q1, [%0, 16]                  \n"
1439       "ldr         q0, [%0], -32                 \n"  // src -= 32
1440       "subs        %w3, %w3, #16                 \n"  // 16 pixels per loop.
1441       "tbl         v2.16b, {v1.16b}, v4.16b      \n"
1442       "tbl         v3.16b, {v0.16b}, v4.16b      \n"
1443       "uzp1        v0.16b, v2.16b, v3.16b        \n"  // U
1444       "uzp2        v1.16b, v2.16b, v3.16b        \n"  // V
1445       "st1         {v0.16b}, [%1], #16           \n"  // dst += 16
1446       "st1         {v1.16b}, [%2], #16           \n"
1447       "b.gt        1b                            \n"
1448       : "+r"(src_uv),           // %0
1449         "+r"(dst_u),            // %1
1450         "+r"(dst_v),            // %2
1451         "+r"(width)             // %3
1452       : "r"(&kShuffleMirrorUV)  // %4
1453       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
1454 }
1455 
1456 // Shuffle table for reversing the ARGB.
1457 static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
1458                                          4u,  5u,  6u,  7u,  0u, 1u, 2u,  3u};
1459 
ARGBMirrorRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,int width)1460 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
1461   asm volatile(
1462       // Start at end of source row.
1463       "ld1         {v4.16b}, [%3]                \n"  // shuffler
1464       "add         %0, %0, %w2, sxtw #2          \n"
1465       "sub         %0, %0, #32                   \n"
1466       "1:                                        \n"
1467       "ldr         q1, [%0, 16]                  \n"
1468       "ldr         q0, [%0], -32                 \n"  // src -= 32
1469       "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop.
1470       "tbl         v2.16b, {v1.16b}, v4.16b      \n"
1471       "tbl         v3.16b, {v0.16b}, v4.16b      \n"
1472       "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // dst += 32
1473       "b.gt        1b                            \n"
1474       : "+r"(src_argb),           // %0
1475         "+r"(dst_argb),           // %1
1476         "+r"(width)               // %2
1477       : "r"(&kShuffleMirrorARGB)  // %3
1478       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
1479 }
1480 
RGB24MirrorRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_rgb24,int width)1481 void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
1482                          uint8_t* dst_rgb24,
1483                          int width) {
1484   asm volatile(
1485       "ld1         {v3.16b}, [%4]                \n"  // shuffler
1486       "add         %0, %0, %w2, sxtw #1          \n"  // Start at end of row.
1487       "add         %0, %0, %w2, sxtw             \n"
1488       "sub         %0, %0, #48                   \n"
1489 
1490       "1:                                        \n"
1491       "ld3         {v0.16b, v1.16b, v2.16b}, [%0], %3 \n"  // src -= 48
1492       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
1493       "tbl         v0.16b, {v0.16b}, v3.16b      \n"
1494       "tbl         v1.16b, {v1.16b}, v3.16b      \n"
1495       "tbl         v2.16b, {v2.16b}, v3.16b      \n"
1496       "st3         {v0.16b, v1.16b, v2.16b}, [%1], #48 \n"  // dst += 48
1497       "b.gt        1b                            \n"
1498       : "+r"(src_rgb24),      // %0
1499         "+r"(dst_rgb24),      // %1
1500         "+r"(width)           // %2
1501       : "r"((ptrdiff_t)-48),  // %3
1502         "r"(&kShuffleMirror)  // %4
1503       : "cc", "memory", "v0", "v1", "v2", "v3");
1504 }
1505 
RGB24ToARGBRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_argb,int width)1506 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
1507                          uint8_t* dst_argb,
1508                          int width) {
1509   asm volatile(
1510       "movi        v4.8b, #255                   \n"  // Alpha
1511       "1:                                        \n"
1512       "ld3         {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of
1513                                                        // RGB24.
1514       "prfm        pldl1keep, [%0, 448]          \n"
1515       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1516       "st4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
1517       "b.gt        1b                            \n"
1518       : "+r"(src_rgb24),  // %0
1519         "+r"(dst_argb),   // %1
1520         "+r"(width)       // %2
1521       :
1522       : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
1523   );
1524 }
1525 
RAWToARGBRow_NEON(const uint8_t * src_raw,uint8_t * dst_argb,int width)1526 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1527   asm volatile(
1528       "movi        v5.8b, #255                   \n"  // Alpha
1529       "1:                                        \n"
1530       "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
1531       "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
1532       "orr         v3.8b, v1.8b, v1.8b           \n"   // move g
1533       "prfm        pldl1keep, [%0, 448]          \n"
1534       "orr         v4.8b, v0.8b, v0.8b           \n"         // move r
1535       "st4         {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
1536       "b.gt        1b                            \n"
1537       : "+r"(src_raw),   // %0
1538         "+r"(dst_argb),  // %1
1539         "+r"(width)      // %2
1540       :
1541       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
1542   );
1543 }
1544 
RAWToRGBARow_NEON(const uint8_t * src_raw,uint8_t * dst_rgba,int width)1545 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
1546   asm volatile(
1547       "movi        v0.8b, #255                   \n"  // Alpha
1548       "1:                                        \n"
1549       "ld3         {v3.8b,v4.8b,v5.8b}, [%0], #24 \n"  // read r g b
1550       "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
1551       "orr         v2.8b, v4.8b, v4.8b           \n"   // move g
1552       "prfm        pldl1keep, [%0, 448]          \n"
1553       "orr         v1.8b, v5.8b, v5.8b           \n"         // move r
1554       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store a b g r
1555       "b.gt        1b                            \n"
1556       : "+r"(src_raw),   // %0
1557         "+r"(dst_rgba),  // %1
1558         "+r"(width)      // %2
1559       :
1560       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
1561   );
1562 }
1563 
RAWToRGB24Row_NEON(const uint8_t * src_raw,uint8_t * dst_rgb24,int width)1564 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
1565   asm volatile(
1566       "1:                                        \n"
1567       "ld3         {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
1568       "subs        %w2, %w2, #8                  \n"   // 8 processed per loop.
1569       "orr         v3.8b, v1.8b, v1.8b           \n"   // move g
1570       "prfm        pldl1keep, [%0, 448]          \n"
1571       "orr         v4.8b, v0.8b, v0.8b           \n"   // move r
1572       "st3         {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
1573       "b.gt        1b                            \n"
1574       : "+r"(src_raw),    // %0
1575         "+r"(dst_rgb24),  // %1
1576         "+r"(width)       // %2
1577       :
1578       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
1579   );
1580 }
1581 
// Expands the 8 RGB565 pixels held in v0.8h to 8-bit channels:
// b = v0.8b, g = v1.8b, r = v2.8b.  v4 and v6 are used as scratch.
#define RGB565TOARGB                                                          \
  "shrn       v6.8b, v0.8h, #5               \n" /* low byte: RRGGGGGG     */ \
  "shl        v6.8b, v6.8b, #2               \n" /* GGGGGG00               */ \
  "ushr       v4.8b, v6.8b, #6               \n" /* 000000GG (top 2 of G)  */ \
  "orr        v1.8b, v4.8b, v6.8b            \n" /* 8-bit G                */ \
  "xtn        v2.8b, v0.8h                   \n" /* low half: GGGBBBBB     */ \
  "ushr       v0.8h, v0.8h, #11              \n" /* 000RRRRR               */ \
  "xtn2       v2.16b, v0.8h                  \n" /* high half: 000RRRRR    */ \
  "shl        v2.16b, v2.16b, #3             \n" /* BBBBB000 / RRRRR000    */ \
  "ushr       v0.16b, v2.16b, #5             \n" /* 00000BBB / 00000RRR    */ \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* low: B, high: R        */ \
  "dup        v2.2d, v0.d[1]                 \n" /* R moved down into v2   */
1594 
RGB565ToARGBRow_NEON(const uint8_t * src_rgb565,uint8_t * dst_argb,int width)1595 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
1596                           uint8_t* dst_argb,
1597                           int width) {
1598   asm volatile(
1599       "movi        v3.8b, #255                   \n"  // Alpha
1600       "1:                                        \n"
1601       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
1602       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1603       "prfm        pldl1keep, [%0, 448]          \n" RGB565TOARGB
1604       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
1605       "b.gt        1b                            \n"
1606       : "+r"(src_rgb565),  // %0
1607         "+r"(dst_argb),    // %1
1608         "+r"(width)        // %2
1609       :
1610       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
1611   );
1612 }
1613 
// Expands the 8 ARGB1555 pixels held in v0.8h to 8-bit channels:
// b = v0.8b, g = v1.8b, r = v2.8b, a = v3.8b.
#define ARGB1555TOARGB                                                        \
  "ushr       v2.8h, v0.8h, #10              \n" /* low byte: xxARRRRR     */ \
  "shl        v2.8h, v2.8h, #3               \n" /* low byte: RRRRR000     */ \
  "xtn        v3.8b, v2.8h                   \n" /* v3 low = RRRRR000      */ \
  "sshr       v2.8h, v0.8h, #15              \n" /* alpha bit sign-filled  */ \
  "xtn2       v3.16b, v2.8h                  \n" /* v3 high = AAAAAAAA     */ \
  "xtn        v2.8b, v0.8h                   \n" /* v2 low = GGGBBBBB      */ \
  "shrn2      v2.16b, v0.8h, #5              \n" /* v2 high = RRRGGGGG     */ \
  "ushr       v1.16b, v3.16b, #5             \n" /* 00000RRR / 00000AAA    */ \
  "shl        v0.16b, v2.16b, #3             \n" /* BBBBB000 / GGGGG000    */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* 00000BBB / 00000GGG    */ \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* low: B, high: G        */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* low: R, high: A        */ \
  "dup        v1.2d, v0.d[1]                 \n" /* G moved down into v1   */ \
  "dup        v3.2d, v2.d[1]                 \n" /* A moved down into v3   */
1633 
// Same expansion as ARGB1555TOARGB but without producing alpha:
// the 8 pixels in v0.8h become b = v0.8b, g = v1.8b, r = v2.8b.
// v3 is used as scratch.
#define RGB555TOARGB                                                          \
  "ushr       v2.8h, v0.8h, #10              \n" /* low byte: xxxRRRRR     */ \
  "shl        v2.8h, v2.8h, #3               \n" /* low byte: RRRRR000     */ \
  "xtn        v3.8b, v2.8h                   \n" /* v3 low = RRRRR000      */ \
  "xtn        v2.8b, v0.8h                   \n" /* v2 low = GGGBBBBB      */ \
  "shrn2      v2.16b, v0.8h, #5              \n" /* v2 high = xxxGGGGG     */ \
  "ushr       v1.16b, v3.16b, #5             \n" /* 00000RRR               */ \
  "shl        v0.16b, v2.16b, #3             \n" /* BBBBB000 / GGGGG000    */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* 00000BBB / 00000GGG    */ \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* low: B, high: G        */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* low: R                 */ \
  "dup        v1.2d, v0.d[1]                 \n" /* G moved down into v1   */
1650 
ARGB1555ToARGBRow_NEON(const uint8_t * src_argb1555,uint8_t * dst_argb,int width)1651 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
1652                             uint8_t* dst_argb,
1653                             int width) {
1654   asm volatile(
1655       "movi        v3.8b, #255                   \n"  // Alpha
1656       "1:                                        \n"
1657       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
1658       "prfm        pldl1keep, [%0, 448]          \n"
1659       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1660       ARGB1555TOARGB
1661       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
1662       "b.gt        1b                            \n"
1663       : "+r"(src_argb1555),  // %0
1664         "+r"(dst_argb),      // %1
1665         "+r"(width)          // %2
1666       :
1667       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1668   );
1669 }
1670 
// Expands the 8 ARGB4444 pixels held in v0.8h to 8-bit channels by
// replicating each nibble: b = v0.8b, g = v1.8b, r = v2.8b, a = v3.8b.
#define ARGB4444TOARGB                                                        \
  "shrn       v1.8b, v0.8h, #8               \n" /* v1 low  = A:R nibbles  */ \
  "xtn2       v1.16b, v0.8h                  \n" /* v1 high = G:B nibbles  */ \
  "shl        v2.16b, v1.16b, #4             \n" /* RRRR0000 / BBBB0000    */ \
  "ushr       v3.16b, v1.16b, #4             \n" /* 0000AAAA / 0000GGGG    */ \
  "ushr       v0.16b, v2.16b, #4             \n" /* 0000RRRR / 0000BBBB    */ \
  "shl        v1.16b, v3.16b, #4             \n" /* AAAA0000 / GGGG0000    */ \
  "orr        v2.16b, v0.16b, v2.16b         \n" /* low: R, high: B        */ \
  "orr        v3.16b, v1.16b, v3.16b         \n" /* low: A, high: G        */ \
  "dup        v0.2d, v2.d[1]                 \n" /* B moved down into v0   */ \
  "dup        v1.2d, v3.d[1]                 \n" /* G moved down into v1   */
1684 
ARGB4444ToARGBRow_NEON(const uint8_t * src_argb4444,uint8_t * dst_argb,int width)1685 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
1686                             uint8_t* dst_argb,
1687                             int width) {
1688   asm volatile(
1689       "1:                                        \n"
1690       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
1691       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1692       "prfm        pldl1keep, [%0, 448]          \n" ARGB4444TOARGB
1693       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
1694       "b.gt        1b                            \n"
1695       : "+r"(src_argb4444),  // %0
1696         "+r"(dst_argb),      // %1
1697         "+r"(width)          // %2
1698       :
1699       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
1700   );
1701 }
1702 
ARGBToRGB24Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb24,int width)1703 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
1704                          uint8_t* dst_rgb24,
1705                          int width) {
1706   asm volatile(
1707       "1:                                        \n"
1708       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ARGB
1709       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop.
1710       "prfm        pldl1keep, [%0, 448]          \n"
1711       "st3         {v0.16b,v1.16b,v2.16b}, [%1], #48 \n"  // store 8 RGB24
1712       "b.gt        1b                            \n"
1713       : "+r"(src_argb),   // %0
1714         "+r"(dst_rgb24),  // %1
1715         "+r"(width)       // %2
1716       :
1717       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1718   );
1719 }
1720 
// Converts a row of 4-byte ARGB pixels to 3-byte RAW pixels: the first three
// bytes of each source pixel (b, g, r) are written in reversed order (r, g, b)
// and the fourth byte is discarded.
// Each iteration consumes 32 source bytes (8 pixels) and produces 24 bytes.
// The loop body runs before the count is tested, so whole groups of 8 pixels
// are always processed.
ARGBToRAWRow_NEON(const uint8_t * src_argb,uint8_t * dst_raw,int width)1721 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
1722   asm volatile(
1723       "1:                                        \n"
1724       "ld4         {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
1725       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1726       "orr         v4.8b, v2.8b, v2.8b           \n"  // mov g (overwrites the unused alpha)
1727       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1728       "orr         v5.8b, v1.8b, v1.8b           \n"   // mov b
1729       "st3         {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
1730       "b.gt        1b                            \n"  // loop while width > 0
1731       : "+r"(src_argb),  // %0
1732         "+r"(dst_raw),   // %1
1733         "+r"(width)      // %2
1734       :
1735       : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
1736   );
1737 }
1738 
// Extracts the Y samples from a row of YUY2. ld2 splits the source into
// even-indexed bytes (v0) and odd-indexed bytes (v1); only the even bytes are
// stored.
// Each iteration consumes 32 source bytes and writes 16 Y bytes; the loop body
// runs before the count is tested, so whole groups of 16 pixels are processed.
YUY2ToYRow_NEON(const uint8_t * src_yuy2,uint8_t * dst_y,int width)1739 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
1740   asm volatile(
1741       "1:                                        \n"
1742       "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels of YUY2.
1743       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
1744       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1745       "st1         {v0.16b}, [%1], #16           \n"  // store 16 pixels of Y.
1746       "b.gt        1b                            \n"  // loop while width > 0
1747       : "+r"(src_yuy2),  // %0
1748         "+r"(dst_y),     // %1
1749         "+r"(width)      // %2
1750       :
1751       : "cc", "memory", "v0", "v1"  // Clobber List
1752   );
1753 }
1754 
// Extracts the Y samples from a row of UYVY. ld2 splits the source into
// even-indexed bytes (v0) and odd-indexed bytes (v1); only the odd bytes are
// stored.
// Each iteration consumes 32 source bytes and writes 16 Y bytes; the loop body
// runs before the count is tested, so whole groups of 16 pixels are processed.
UYVYToYRow_NEON(const uint8_t * src_uyvy,uint8_t * dst_y,int width)1755 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
1756   asm volatile(
1757       "1:                                        \n"
1758       "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels of UYVY.
1759       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
1760       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1761       "st1         {v1.16b}, [%1], #16           \n"  // store 16 pixels of Y.
1762       "b.gt        1b                            \n"  // loop while width > 0
1763       : "+r"(src_uyvy),  // %0
1764         "+r"(dst_y),     // %1
1765         "+r"(width)      // %2
1766       :
1767       : "cc", "memory", "v0", "v1"  // Clobber List
1768   );
1769 }
1770 
// Splits the chroma of one YUY2 row into separate U and V rows without any
// vertical averaging. ld4 de-interleaves each 4-byte group into v0..v3; byte 1
// (v1) is stored as U and byte 3 (v3) as V.
// Each iteration consumes 32 source bytes (16 pixels) and writes 8 U and 8 V
// bytes; width counts pixels and the loop body runs before it is tested.
YUY2ToUV422Row_NEON(const uint8_t * src_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)1771 void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
1772                          uint8_t* dst_u,
1773                          uint8_t* dst_v,
1774                          int width) {
1775   asm volatile(
1776       "1:                                        \n"
1777       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
1778       "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
1779       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1780       "st1         {v1.8b}, [%1], #8             \n"  // store 8 U.
1781       "st1         {v3.8b}, [%2], #8             \n"  // store 8 V.
1782       "b.gt        1b                            \n"  // loop while width > 0
1783       : "+r"(src_yuy2),  // %0
1784         "+r"(dst_u),     // %1
1785         "+r"(dst_v),     // %2
1786         "+r"(width)      // %3
1787       :
1788       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1789   );
1790 }
1791 
// Splits the chroma of one UYVY row into separate U and V rows without any
// vertical averaging. ld4 de-interleaves each 4-byte group into v0..v3; byte 0
// (v0) is stored as U and byte 2 (v2) as V.
// Each iteration consumes 32 source bytes (16 pixels) and writes 8 U and 8 V
// bytes; width counts pixels and the loop body runs before it is tested.
UYVYToUV422Row_NEON(const uint8_t * src_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1792 void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
1793                          uint8_t* dst_u,
1794                          uint8_t* dst_v,
1795                          int width) {
1796   asm volatile(
1797       "1:                                        \n"
1798       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
1799       "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
1800       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1801       "st1         {v0.8b}, [%1], #8             \n"  // store 8 U.
1802       "st1         {v2.8b}, [%2], #8             \n"  // store 8 V.
1803       "b.gt        1b                            \n"  // loop while width > 0
1804       : "+r"(src_uyvy),  // %0
1805         "+r"(dst_u),     // %1
1806         "+r"(dst_v),     // %2
1807         "+r"(width)      // %3
1808       :
1809       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1810   );
1811 }
1812 
// Produces one U row and one V row from two vertically adjacent YUY2 rows.
// The second row starts at src_yuy2 + stride_yuy2. U (byte 1 of each 4-byte
// group) and V (byte 3) of the two rows are combined with urhadd, a rounding
// halving add, i.e. (a + b + 1) >> 1.
// Each iteration consumes 32 bytes (16 pixels) from each row and writes 8 U
// and 8 V bytes. Only the first row is prefetched.
// The loop body runs before width is tested.
YUY2ToUVRow_NEON(const uint8_t * src_yuy2,int stride_yuy2,uint8_t * dst_u,uint8_t * dst_v,int width)1813 void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
1814                       int stride_yuy2,
1815                       uint8_t* dst_u,
1816                       uint8_t* dst_v,
1817                       int width) {
1818   const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;  // start of the next row
1819   asm volatile(
1820       "1:                                        \n"
1821       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1822       "subs        %w4, %w4, #16                 \n"  // 16 pixels = 8 UVs.
1823       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1824       "urhadd      v1.8b, v1.8b, v5.8b           \n"  // average rows of U
1825       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch first row ahead
1826       "urhadd      v3.8b, v3.8b, v7.8b           \n"  // average rows of V
1827       "st1         {v1.8b}, [%2], #8             \n"  // store 8 U.
1828       "st1         {v3.8b}, [%3], #8             \n"  // store 8 V.
1829       "b.gt        1b                            \n"  // loop while width > 0
1830       : "+r"(src_yuy2),   // %0
1831         "+r"(src_yuy2b),  // %1
1832         "+r"(dst_u),      // %2
1833         "+r"(dst_v),      // %3
1834         "+r"(width)       // %4
1835       :
1836       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1837         "v7"  // Clobber List
1838   );
1839 }
1840 
// Produces one U row and one V row from two vertically adjacent UYVY rows.
// The second row starts at src_uyvy + stride_uyvy. U (byte 0 of each 4-byte
// group) and V (byte 2) of the two rows are combined with urhadd, a rounding
// halving add, i.e. (a + b + 1) >> 1.
// Each iteration consumes 32 bytes (16 pixels) from each row and writes 8 U
// and 8 V bytes. Only the first row is prefetched.
// The loop body runs before width is tested.
UYVYToUVRow_NEON(const uint8_t * src_uyvy,int stride_uyvy,uint8_t * dst_u,uint8_t * dst_v,int width)1841 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
1842                       int stride_uyvy,
1843                       uint8_t* dst_u,
1844                       uint8_t* dst_v,
1845                       int width) {
1846   const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;  // start of the next row
1847   asm volatile(
1848       "1:                                        \n"
1849       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1850       "subs        %w4, %w4, #16                 \n"  // 16 pixels = 8 UVs.
1851       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1852       "urhadd      v0.8b, v0.8b, v4.8b           \n"  // average rows of U
1853       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch first row ahead
1854       "urhadd      v2.8b, v2.8b, v6.8b           \n"  // average rows of V
1855       "st1         {v0.8b}, [%2], #8             \n"  // store 8 U.
1856       "st1         {v2.8b}, [%3], #8             \n"  // store 8 V.
1857       "b.gt        1b                            \n"  // loop while width > 0
1858       : "+r"(src_uyvy),   // %0
1859         "+r"(src_uyvyb),  // %1
1860         "+r"(dst_u),      // %2
1861         "+r"(dst_v),      // %3
1862         "+r"(width)       // %4
1863       :
1864       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1865         "v7"  // Clobber List
1866   );
1867 }
1868 
// Produces one interleaved UV row from two vertically adjacent YUY2 rows.
// ld2 splits each row into even bytes (Y) and odd bytes (alternating U and V);
// the odd bytes of the two rows are combined with urhadd, a rounding halving
// add, and written out still interleaved.
// Each iteration consumes 32 bytes (16 pixels) from each row and writes 16
// bytes (8 UV pairs). Only the first row is prefetched.
// The loop body runs before width is tested.
YUY2ToNVUVRow_NEON(const uint8_t * src_yuy2,int stride_yuy2,uint8_t * dst_uv,int width)1869 void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
1870                         int stride_yuy2,
1871                         uint8_t* dst_uv,
1872                         int width) {
1873   const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;  // start of the next row
1874   asm volatile(
1875       "1:                                        \n"
1876       "ld2         {v0.16b,v1.16b}, [%0], #32    \n"  // load 16 pixels
1877       "subs        %w3, %w3, #16                 \n"  // 16 pixels = 8 UVs.
1878       "ld2         {v2.16b,v3.16b}, [%1], #32    \n"  // load next row
1879       "urhadd      v4.16b, v1.16b, v3.16b        \n"  // average rows of UV
1880       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch first row ahead
1881       "st1         {v4.16b}, [%2], #16           \n"  // store 8 UV pairs (16 bytes).
1882       "b.gt        1b                            \n"  // loop while width > 0
1883       : "+r"(src_yuy2),   // %0
1884         "+r"(src_yuy2b),  // %1
1885         "+r"(dst_uv),     // %2
1886         "+r"(width)       // %3
1887       :
1888       : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
1889   );
1890 }
1891 
1892 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of a row of 4-byte pixels with a table lookup.
// shuffler points to 16 byte indices that are loaded once before the loop;
// each output byte i of a 16-byte group is input byte shuffler[i] of the same
// group (tbl yields 0 for indices outside 0..15).
// Each iteration processes 16 bytes (4 pixels); the loop body runs before
// width is tested.
ARGBShuffleRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,const uint8_t * shuffler,int width)1893 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
1894                          uint8_t* dst_argb,
1895                          const uint8_t* shuffler,
1896                          int width) {
1897   asm volatile(
1898       "ld1         {v2.16b}, [%3]                \n"  // shuffler
1899       "1:                                        \n"
1900       "ld1         {v0.16b}, [%0], #16           \n"  // load 4 pixels.
1901       "subs        %w2, %w2, #4                  \n"  // 4 processed per loop
1902       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1903       "tbl         v1.16b, {v0.16b}, v2.16b      \n"  // look up 4 pixels
1904       "st1         {v1.16b}, [%1], #16           \n"  // store 4.
1905       "b.gt        1b                            \n"  // loop while width > 0
1906       : "+r"(src_argb),                   // %0
1907         "+r"(dst_argb),                   // %1
1908         "+r"(width)                       // %2
1909       : "r"(shuffler)                     // %3
1910       : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1911   );
1912 }
1913 
// Packs separate Y, U and V rows into one YUY2 row.
// ld2 splits 16 Y samples into even-indexed (v0) and odd-indexed (v1) halves;
// the odd half is moved to v2 so that v1 can receive U. st4 then interleaves
// v0..v3 as Y(even), U, Y(odd), V.
// Each iteration consumes 16 Y, 8 U and 8 V bytes and writes 32 bytes; the
// loop body runs before width is tested.
I422ToYUY2Row_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_yuy2,int width)1914 void I422ToYUY2Row_NEON(const uint8_t* src_y,
1915                         const uint8_t* src_u,
1916                         const uint8_t* src_v,
1917                         uint8_t* dst_yuy2,
1918                         int width) {
1919   asm volatile(
1920       "1:                                        \n"
1921       "ld2         {v0.8b, v1.8b}, [%0], #16     \n"  // load 16 Ys
1922       "subs        %w4, %w4, #16                 \n"  // 16 pixels
1923       "orr         v2.8b, v1.8b, v1.8b           \n"  // move odd Ys to v2
1924       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch Y ahead
1925       "ld1         {v1.8b}, [%1], #8             \n"         // load 8 Us
1926       "ld1         {v3.8b}, [%2], #8             \n"         // load 8 Vs
1927       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1928       "b.gt        1b                            \n"  // loop while width > 0
1929       : "+r"(src_y),     // %0
1930         "+r"(src_u),     // %1
1931         "+r"(src_v),     // %2
1932         "+r"(dst_yuy2),  // %3
1933         "+r"(width)      // %4
1934       :
1935       : "cc", "memory", "v0", "v1", "v2", "v3");
1936 }
1937 
// Packs separate Y, U and V rows into one UYVY row.
// ld2 splits 16 Y samples into even-indexed (v1) and odd-indexed (v2) halves;
// the odd half is moved to v3 so that v2 can receive V. st4 then interleaves
// v0..v3 as U, Y(even), V, Y(odd).
// Each iteration consumes 16 Y, 8 U and 8 V bytes and writes 32 bytes; the
// loop body runs before width is tested.
I422ToUYVYRow_NEON(const uint8_t * src_y,const uint8_t * src_u,const uint8_t * src_v,uint8_t * dst_uyvy,int width)1938 void I422ToUYVYRow_NEON(const uint8_t* src_y,
1939                         const uint8_t* src_u,
1940                         const uint8_t* src_v,
1941                         uint8_t* dst_uyvy,
1942                         int width) {
1943   asm volatile(
1944       "1:                                        \n"
1945       "ld2         {v1.8b,v2.8b}, [%0], #16      \n"  // load 16 Ys
1946       "orr         v3.8b, v2.8b, v2.8b           \n"  // move odd Ys to v3
1947       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch Y ahead
1948       "ld1         {v0.8b}, [%1], #8             \n"         // load 8 Us
1949       "ld1         {v2.8b}, [%2], #8             \n"         // load 8 Vs
1950       "subs        %w4, %w4, #16                 \n"         // 16 pixels
1951       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1952       "b.gt        1b                            \n"  // loop while width > 0
1953       : "+r"(src_y),     // %0
1954         "+r"(src_u),     // %1
1955         "+r"(src_v),     // %2
1956         "+r"(dst_uyvy),  // %3
1957         "+r"(width)      // %4
1958       :
1959       : "cc", "memory", "v0", "v1", "v2", "v3");
1960 }
1961 
// Converts a row of 8-bit ARGB pixels to 16-bit RGB565.
// ld4 de-interleaves 8 pixels into v16..v19; the packing itself is done by the
// ARGBTORGB565 macro (defined elsewhere in this file), and the 16 bytes stored
// afterwards come from v18.
// NOTE(review): the macro body is not visible here; it is presumed to leave the
// packed pixels in v18 and to touch only registers in the clobber list.
// Each iteration consumes 32 source bytes and writes 16; the loop body runs
// before width is tested.
ARGBToRGB565Row_NEON(const uint8_t * src_argb,uint8_t * dst_rgb565,int width)1962 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
1963                           uint8_t* dst_rgb565,
1964                           int width) {
1965   asm volatile(
1966       "1:                                        \n"
1967       "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8
1968                                                                  // pixels
1969       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1970       "prfm        pldl1keep, [%0, 448]          \n" ARGBTORGB565  // prefetch ahead, then pack
1971       "st1         {v18.16b}, [%1], #16          \n"  // store 8 pixels RGB565.
1972       "b.gt        1b                            \n"  // loop while width > 0
1973       : "+r"(src_argb),    // %0
1974         "+r"(dst_rgb565),  // %1
1975         "+r"(width)        // %2
1976       :
1977       : "cc", "memory", "v16", "v17", "v18", "v19");
1978 }
1979 
// Converts a row of 8-bit ARGB pixels to 16-bit RGB565 after adding a dither
// value to each colour channel.
// dither4 holds four dither bytes; dup replicates the 32-bit value across v1,
// so the low 8 bytes of v1 are those four bytes repeated twice. The same
// per-pixel dither byte is added with unsigned saturation (uqadd) to the first
// three channels (v16, v17, v18); the fourth channel (v19) is left untouched.
// Packing is done by the ARGBTORGB565 macro (defined elsewhere in this file).
// NOTE(review): the macro is presumed to leave the packed pixels in v18.
// Each iteration consumes 32 source bytes (8 pixels) and writes 16; the loop
// body runs before width is tested.
ARGBToRGB565DitherRow_NEON(const uint8_t * src_argb,uint8_t * dst_rgb,uint32_t dither4,int width)1980 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
1981                                 uint8_t* dst_rgb,
1982                                 uint32_t dither4,
1983                                 int width) {
1984   asm volatile(
1985       "dup         v1.4s, %w3                    \n"  // dither4
1986       "1:                                        \n"
1987       "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
1988       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
1989       "uqadd       v16.8b, v16.8b, v1.8b         \n"  // dither first channel (saturating)
1990       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
1991       "uqadd       v17.8b, v17.8b, v1.8b         \n"  // dither second channel
1992       "uqadd       v18.8b, v18.8b, v1.8b         \n" ARGBTORGB565  // dither third channel, then pack
1993       "st1         {v18.16b}, [%1], #16          \n"  // store 8 pixels RGB565.
1994       "b.gt        1b                            \n"  // loop while width > 0
1995       : "+r"(src_argb),  // %0
1996         "+r"(dst_rgb),   // %1
1997         "+r"(width)      // %2
1998       : "r"(dither4)     // %3
1999       : "cc", "memory", "v1", "v16", "v17", "v18", "v19");
2000 }
2001 
// Converts a row of 8-bit ARGB pixels to 16-bit ARGB1555.
// ld4 de-interleaves 8 pixels into v16..v19; the packing itself is done by the
// ARGBTOARGB1555 macro (defined elsewhere in this file), and the 16 bytes
// stored afterwards come from v0.
// NOTE(review): the macro body is not visible here; it is presumed to leave the
// packed pixels in v0 and to touch only registers in the clobber list.
// Each iteration consumes 32 source bytes and writes 16; the loop body runs
// before width is tested.
ARGBToARGB1555Row_NEON(const uint8_t * src_argb,uint8_t * dst_argb1555,int width)2002 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
2003                             uint8_t* dst_argb1555,
2004                             int width) {
2005   asm volatile(
2006       "1:                                        \n"
2007       "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8
2008                                                                  // pixels
2009       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2010       "prfm        pldl1keep, [%0, 448]          \n" ARGBTOARGB1555  // prefetch ahead, then pack
2011       "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels
2012       "b.gt        1b                            \n"  // loop while width > 0
2013       : "+r"(src_argb),      // %0
2014         "+r"(dst_argb1555),  // %1
2015         "+r"(width)          // %2
2016       :
2017       : "cc", "memory", "v0", "v16", "v17", "v18", "v19");
2018 }
2019 
// Converts a row of 8-bit ARGB pixels to 16-bit ARGB4444.
// v23 is preloaded with 0x0f in every byte before the loop. ld4 de-interleaves
// 8 pixels into v16..v19; the packing itself is done by the ARGBTOARGB4444
// macro (defined elsewhere in this file), and the 16 bytes stored afterwards
// come from v0.
// NOTE(review): the macro body is not visible here; v23 is presumably the mask
// it consumes, and it is presumed to leave the packed pixels in v0.
// Each iteration consumes 32 source bytes and writes 16; the loop body runs
// before width is tested.
ARGBToARGB4444Row_NEON(const uint8_t * src_argb,uint8_t * dst_argb4444,int width)2020 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
2021                             uint8_t* dst_argb4444,
2022                             int width) {
2023   asm volatile(
2024       "movi        v23.16b, #0x0f                \n"  // low-nibble mask: bits to
2025                                                       // clear (bic).
2026       "1:                                        \n"
2027       "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8
2028                                                                  // pixels
2029       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2030       "prfm        pldl1keep, [%0, 448]          \n" ARGBTOARGB4444  // prefetch ahead, then pack
2031       "st1         {v0.16b}, [%1], #16           \n"  // store 8 pixels
2032       "b.gt        1b                            \n"  // loop while width > 0
2033       : "+r"(src_argb),      // %0
2034         "+r"(dst_argb4444),  // %1
2035         "+r"(width)          // %2
2036       :
2037       : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
2038 }
2039 
2040 #if LIBYUV_USE_ST2
// ST2 variant (compiled when LIBYUV_USE_ST2 is set).
// Widens a row of 8-bit ARGB pixels to 16 bits per channel by writing every
// source byte twice in succession, so channel value c becomes the 16-bit value
// (c << 8) | c.
// Each 16-byte register is copied and the pair is stored with st2, which
// interleaves the two identical registers byte by byte.
// Each iteration consumes 32 source bytes (8 pixels) and writes 64 bytes; the
// loop body runs before width is tested.
ARGBToAR64Row_NEON(const uint8_t * src_argb,uint16_t * dst_ar64,int width)2041 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
2042                         uint16_t* dst_ar64,
2043                         int width) {
2044   asm volatile(
2045       "1:                                        \n"
2046       "ldp         q0, q2, [%0], #32             \n"  // load 8 pixels
2047       "mov         v1.16b, v0.16b                \n"  // duplicate pixels 0-3
2048       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2049       "mov         v3.16b, v2.16b                \n"  // duplicate pixels 4-7
2050       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2051       "st2         {v0.16b, v1.16b}, [%1], #32   \n"  // store 4 pixels
2052       "st2         {v2.16b, v3.16b}, [%1], #32   \n"  // store 4 pixels
2053       "b.gt        1b                            \n"  // loop while width > 0
2054       : "+r"(src_argb),  // %0
2055         "+r"(dst_ar64),  // %1
2056         "+r"(width)      // %2
2057       :
2058       : "cc", "memory", "v0", "v1", "v2", "v3");
2059 }
2060 
// tbl index vector for four 4-byte pixels: swaps bytes 0 and 2 of every pixel
// and leaves bytes 1 and 3 in place.
2061 static const uvec8 kShuffleARGBToABGR = {2,  1, 0, 3,  6,  5,  4,  7,
2062                                          10, 9, 8, 11, 14, 13, 12, 15};
2063 
// ST2 variant (compiled when LIBYUV_USE_ST2 is set).
// Converts a row of 8-bit ARGB pixels to 16-bit-per-channel AB64.
// Bytes 0 and 2 of every pixel are first swapped with tbl using
// kShuffleARGBToABGR; each register is then copied and stored with st2, which
// writes every byte twice in succession so channel value c becomes
// (c << 8) | c.
// Each iteration consumes 32 source bytes (8 pixels) and writes 64 bytes; the
// loop body runs before width is tested.
ARGBToAB64Row_NEON(const uint8_t * src_argb,uint16_t * dst_ab64,int width)2064 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
2065                         uint16_t* dst_ab64,
2066                         int width) {
2067   asm volatile(
2068       "ldr         q4, [%3]                      \n"  // shuffler
2069       "1:                                        \n"
2070       "ldp         q0, q2, [%0], #32             \n"  // load 8 pixels
2071       "tbl         v0.16b, {v0.16b}, v4.16b      \n"  // swap bytes 0/2, pixels 0-3
2072       "tbl         v2.16b, {v2.16b}, v4.16b      \n"  // swap bytes 0/2, pixels 4-7
2073       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2074       "mov         v1.16b, v0.16b                \n"  // duplicate pixels 0-3
2075       "mov         v3.16b, v2.16b                \n"  // duplicate pixels 4-7
2076       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2077       "st2         {v0.16b, v1.16b}, [%1], #32   \n"  // store 4 pixels
2078       "st2         {v2.16b, v3.16b}, [%1], #32   \n"  // store 4 pixels
2079       "b.gt        1b                            \n"  // loop while width > 0
2080       : "+r"(src_argb),           // %0
2081         "+r"(dst_ab64),           // %1
2082         "+r"(width)               // %2
2083       : "r"(&kShuffleARGBToABGR)  // %3
2084       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2085 }
2086 #else
// ZIP variant (compiled when LIBYUV_USE_ST2 is not set).
// Widens a row of 8-bit ARGB pixels to 16 bits per channel. Zipping a register
// with itself (zip1 for the low 8 bytes, zip2 for the high 8 bytes) writes
// every source byte twice in succession, so channel value c becomes the 16-bit
// value (c << 8) | c.
// Each iteration consumes 32 source bytes (8 pixels) and writes 64 bytes; the
// loop body runs before width is tested.
ARGBToAR64Row_NEON(const uint8_t * src_argb,uint16_t * dst_ar64,int width)2087 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
2088                         uint16_t* dst_ar64,
2089                         int width) {
2090   asm volatile(
2091       "1:                                        \n"
2092       "ldp         q0, q1, [%0], #32             \n"  // load 8 ARGB pixels
2093       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2094       "zip1        v2.16b, v0.16b, v0.16b        \n"  // pixels 0-1 widened
2095       "zip2        v3.16b, v0.16b, v0.16b        \n"  // pixels 2-3 widened
2096       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2097       "zip1        v4.16b, v1.16b, v1.16b        \n"  // pixels 4-5 widened
2098       "zip2        v5.16b, v1.16b, v1.16b        \n"  // pixels 6-7 widened
2099       "st1         {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n"  // 8 AR64
2100       "b.gt        1b                            \n"  // loop while width > 0
2101       : "+r"(src_argb),  // %0
2102         "+r"(dst_ar64),  // %1
2103         "+r"(width)      // %2
2104       :
2105       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
2106 }
2107 
// Pair of tbl index vectors that widen one 16-byte register (four 4-byte
// pixels) to 16 bits per channel while swapping bytes 0 and 2 of each pixel.
// Entry [0] selects from source bytes 0-7 (pixels 0-1), entry [1] from source
// bytes 8-15 (pixels 2-3); each index appears twice so every byte is emitted
// twice in succession.
2108 static const uvec8 kShuffleARGBToAB64[2] = {
2109     {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
2110     {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
2111 
ARGBToAB64Row_NEON(const uint8_t * src_argb,uint16_t * dst_ab64,int width)2112 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
2113                         uint16_t* dst_ab64,
2114                         int width) {
2115   asm volatile(
2116       "ldp         q6, q7, [%3]                  \n"  // 2 shufflers
2117       "1:                                        \n"
2118       "ldp         q0, q1, [%0], #32             \n"  // load 8 pixels
2119       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2120       "tbl         v2.16b, {v0.16b}, v6.16b      \n"  // ARGB to AB64
2121       "tbl         v3.16b, {v0.16b}, v7.16b      \n"
2122       "prfm        pldl1keep, [%0, 448]          \n"
2123       "tbl         v4.16b, {v1.16b}, v6.16b      \n"
2124       "tbl         v5.16b, {v1.16b}, v7.16b      \n"
2125       "st1         {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n"  // 8 AR64
2126       "b.gt        1b                            \n"
2127       : "+r"(src_argb),              // %0
2128         "+r"(dst_ab64),              // %1
2129         "+r"(width)                  // %2
2130       : "r"(&kShuffleARGBToAB64[0])  // %3
2131       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2132 }
2133 #endif  // LIBYUV_USE_ST2
2134 
// tbl index vector over a 32-byte (two-register) table: selects every
// odd-indexed byte, i.e. the second byte of each 16-bit element, in order.
2135 static const uvec8 kShuffleAR64ToARGB = {1,  3,  5,  7,  9,  11, 13, 15,
2136                                          17, 19, 21, 23, 25, 27, 29, 31};
2137 
// Narrows a row of 16-bit-per-channel AR64 pixels to 8-bit ARGB by keeping the
// second byte of every 16-bit channel (the indices in kShuffleAR64ToARGB).
// Each tbl uses a two-register (32-byte) table holding 4 source pixels and
// yields 16 bytes (4 destination pixels).
// Each iteration consumes 64 source bytes (8 pixels) and writes 32 bytes; the
// loop body runs before width is tested.
AR64ToARGBRow_NEON(const uint16_t * src_ar64,uint8_t * dst_argb,int width)2138 void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
2139                         uint8_t* dst_argb,
2140                         int width) {
2141   asm volatile(
2142       "ldr         q4, [%3]                      \n"  // shuffler
2143       "1:                                        \n"
2144       "ldp         q0, q1, [%0], #32             \n"  // load 4 pixels
2145       "ldp         q2, q3, [%0], #32             \n"  // load 4 pixels
2146       "tbl         v0.16b, {v0.16b, v1.16b}, v4.16b \n"  // narrow pixels 0-3
2147       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2148       "tbl         v2.16b, {v2.16b, v3.16b}, v4.16b \n"  // narrow pixels 4-7
2149       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2150       "stp         q0, q2, [%1], #32             \n"  // store 8 pixels
2151       "b.gt        1b                            \n"  // loop while width > 0
2152       : "+r"(src_ar64),           // %0
2153         "+r"(dst_argb),           // %1
2154         "+r"(width)               // %2
2155       : "r"(&kShuffleAR64ToARGB)  // %3
2156       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2157 }
2158 
// tbl index vector over a 32-byte (two-register) table: selects the second
// byte of each 16-bit element, with channels 0 and 2 of every pixel swapped.
2159 static const uvec8 kShuffleAB64ToARGB = {5,  3,  1,  7,  13, 11, 9,  15,
2160                                          21, 19, 17, 23, 29, 27, 25, 31};
2161 
// Narrows a row of 16-bit-per-channel AB64 pixels to 8-bit ARGB. The
// kShuffleAB64ToARGB indices keep the second byte of every 16-bit channel and
// swap channels 0 and 2 of each pixel in the same lookup.
// Each tbl uses a two-register (32-byte) table holding 4 source pixels and
// yields 16 bytes (4 destination pixels).
// Each iteration consumes 64 source bytes (8 pixels) and writes 32 bytes; the
// loop body runs before width is tested.
AB64ToARGBRow_NEON(const uint16_t * src_ab64,uint8_t * dst_argb,int width)2162 void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
2163                         uint8_t* dst_argb,
2164                         int width) {
2165   asm volatile(
2166       "ldr         q4, [%3]                      \n"  // shuffler
2167       "1:                                        \n"
2168       "ldp         q0, q1, [%0], #32             \n"  // load 4 pixels
2169       "ldp         q2, q3, [%0], #32             \n"  // load 4 pixels
2170       "tbl         v0.16b, {v0.16b, v1.16b}, v4.16b \n"  // narrow + swap, pixels 0-3
2171       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2172       "tbl         v2.16b, {v2.16b, v3.16b}, v4.16b \n"  // narrow + swap, pixels 4-7
2173       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2174       "stp         q0, q2, [%1], #32             \n"  // store 8 pixels
2175       "b.gt        1b                            \n"  // loop while width > 0
2176       : "+r"(src_ab64),           // %0
2177         "+r"(dst_argb),           // %1
2178         "+r"(width)               // %2
2179       : "r"(&kShuffleAB64ToARGB)  // %3
2180       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2181 }
2182 
// Copies the fourth byte of every 4-byte ARGB pixel to dst_a.
// ld4 de-interleaves the four channels into v0..v3; only v3 is stored.
// Each iteration consumes 64 source bytes (16 pixels) and writes 16 bytes; the
// loop body runs before width is tested.
ARGBExtractAlphaRow_NEON(const uint8_t * src_argb,uint8_t * dst_a,int width)2183 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
2184                               uint8_t* dst_a,
2185                               int width) {
2186   asm volatile(
2187       "1:                                        \n"
2188       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
2189       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2190       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop
2191       "st1         {v3.16b}, [%1], #16           \n"  // store 16 A's.
2192       "b.gt        1b                            \n"  // loop while width > 0
2193       : "+r"(src_argb),  // %0
2194         "+r"(dst_a),     // %1
2195         "+r"(width)      // %2
2196       :
2197       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2198   );
2199 }
2200 
// Unsigned coefficient magnitudes for an RGB -> UV conversion, 8 bytes total.
// ARGBToUV444MatrixRow_NEON loads the whole struct with a single 8-byte load
// and reads kRGBToU[0..2] and kRGBToV[0..1]; it does not read kRGBToU[3],
// kRGBToV[2] or kRGBToV[3].
2201 struct RgbUVConstants {
2202   uint8_t kRGBToU[4];  // {B, G, R, unused} magnitudes for U
2203   uint8_t kRGBToV[4];  // {B, G, R, unused} magnitudes for V
2204 };
2205 
2206 // 8x1 pixels.
// Computes one U and one V sample per ARGB pixel (no subsampling) using the
// coefficient magnitudes in rgbuvconstants. With 16-bit wrap-around
// arithmetic, per pixel:
//   U = (B * kRGBToU[0] - G * kRGBToU[1] - R * kRGBToU[2] + 0x8080) >> 8
//   V = (R * kRGBToU[0] - G * kRGBToV[1] - B * kRGBToV[0] + 0x8080) >> 8
// The R term of V reuses kRGBToU[0] (v24); kRGBToV[2] is never read, so the
// result is only correct for matrices where those two coefficients are equal.
// Each iteration consumes 32 source bytes (8 pixels) and writes 8 U and 8 V
// bytes; the loop body runs before width is tested.
ARGBToUV444MatrixRow_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width,const struct RgbUVConstants * rgbuvconstants)2207 void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
2208                                uint8_t* dst_u,
2209                                uint8_t* dst_v,
2210                                int width,
2211                                const struct RgbUVConstants* rgbuvconstants) {
2212   asm volatile(
2213       "ldr         d0, [%4]                      \n"  // load rgbuvconstants
2214       "dup         v24.16b, v0.b[0]              \n"  // UB coefficient (also used as VR)
2215       "dup         v25.16b, v0.b[1]              \n"  // UG coefficient (subtracted)
2216       "dup         v26.16b, v0.b[2]              \n"  // UR coefficient (subtracted)
2217       "dup         v27.16b, v0.b[4]              \n"  // VB coefficient (subtracted)
2218       "dup         v28.16b, v0.b[5]              \n"  // VG coefficient (subtracted)
2219       "movi        v29.16b, #0x80                \n"  // 0x8080 per 16-bit lane = 128.5
2220 
2221       "1:                                        \n"
2222       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
2223       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
2224       "umull       v4.8h, v0.8b, v24.8b          \n"  // B
2225       "umlsl       v4.8h, v1.8b, v25.8b          \n"  // G
2226       "umlsl       v4.8h, v2.8b, v26.8b          \n"  // R
2227       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch source ahead
2228 
2229       "umull       v3.8h, v2.8b, v24.8b          \n"  // R (overwrites loaded alpha)
2230       "umlsl       v3.8h, v1.8b, v28.8b          \n"  // G
2231       "umlsl       v3.8h, v0.8b, v27.8b          \n"  // B
2232 
2233       "addhn       v0.8b, v4.8h, v29.8h          \n"  // +128 -> unsigned
2234       "addhn       v1.8b, v3.8h, v29.8h          \n"  // +128 -> unsigned
2235 
2236       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels U.
2237       "st1         {v1.8b}, [%2], #8             \n"  // store 8 pixels V.
2238       "b.gt        1b                            \n"  // loop while width > 0
2239       : "+r"(src_argb),      // %0
2240         "+r"(dst_u),         // %1
2241         "+r"(dst_v),         // %2
2242         "+r"(width)          // %3
2243       : "r"(rgbuvconstants)  // %4
2244       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
2245         "v27", "v28", "v29");
2246 }
2247 
2248 // RGB to bt601 coefficients
2249 // UB   0.875 coefficient = 112
2250 // UG -0.5781 coefficient = 74
2251 // UR -0.2969 coefficient = 38
2252 // VB -0.1406 coefficient = 18
2253 // VG -0.7344 coefficient = 94
2254 // VR   0.875 coefficient = 112 (ignored)
2255 
// Laid out as {UB, UG, UR, unused}, {VB, VG, VR, unused}. Values are unsigned
// magnitudes; ARGBToUV444MatrixRow_NEON subtracts the G term and the
// cross-colour term itself and takes VR from the UB slot, so VR here is
// informational only.
2256 static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
2257                                                             {18, 94, 112, 0}};
2258 
2259 // RGB to JPeg coefficients
2260 // UB coeff 0.500    = 127
2261 // UG coeff -0.33126 = 84
2262 // UR coeff -0.16874 = 43
2263 // VB coeff -0.08131 = 20
2264 // VG coeff -0.41869 = 107
2265 // VR coeff 0.500    = 127 (ignored)
2266 
// Laid out as {UB, UG, UR, unused}, {VB, VG, VR, unused}. Values are unsigned
// magnitudes; ARGBToUV444MatrixRow_NEON subtracts the G term and the
// cross-colour term itself and takes VR from the UB slot, so VR here is
// informational only.
2267 static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
2268                                                             {20, 107, 127, 0}};
2269 
// Computes one U and one V sample per ARGB pixel by forwarding to
// ARGBToUV444MatrixRow_NEON with the kRgb24I601UVConstants coefficients.
ARGBToUV444Row_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)2270 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
2271                          uint8_t* dst_u,
2272                          uint8_t* dst_v,
2273                          int width) {
2274   ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
2275                             &kRgb24I601UVConstants);
2276 }
2277 
// Computes one U and one V sample per ARGB pixel by forwarding to
// ARGBToUV444MatrixRow_NEON with the kRgb24JPegUVConstants coefficients.
ARGBToUVJ444Row_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)2278 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
2279                           uint8_t* dst_u,
2280                           uint8_t* dst_v,
2281                           int width) {
2282   ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
2283                             &kRgb24JPegUVConstants);
2284 }
2285 
// Loads the constants consumed by RGBTOUV into v20..v25 as 16-bit lanes.
// The coefficients are half of the full-scale values because the callers feed
// RGBTOUV with channel values that are twice the 2x2 average.
// v25 is filled bytewise with 0x80, which reads as 0x8080 in each 16-bit lane.
2286 #define RGBTOUV_SETUP_REG                                                  \
2287   "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
2288   "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
2289   "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
2290   "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
2291   "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
2292   "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
2293 
2294 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
2295 // clang-format off
// Computes 8 U and 8 V bytes from 16-bit B, G, R lanes (QB, QG, QR) using the
// constants placed in v20..v25 by RGBTOUV_SETUP_REG (or an equivalent setup):
//   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8   -> v0.8b
//   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8   -> v1.8b
// v3 and v4 are used as scratch. v0 and v1 are written only after the last
// read of QB, QG and QR, so the inputs may themselves be v0, v1 and v2.
2296 #define RGBTOUV(QB, QG, QR)                                                 \
2297   "mul        v3.8h, " #QB ",v20.8h          \n" /* B                    */ \
2298   "mul        v4.8h, " #QR ",v20.8h          \n" /* R                    */ \
2299   "mls        v3.8h, " #QG ",v21.8h          \n" /* G                    */ \
2300   "mls        v4.8h, " #QG ",v24.8h          \n" /* G                    */ \
2301   "mls        v3.8h, " #QR ",v22.8h          \n" /* R                    */ \
2302   "mls        v4.8h, " #QB ",v23.8h          \n" /* B                    */ \
2303   "addhn      v0.8b, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
2304   "addhn      v1.8b, v4.8h, v25.8h           \n" /* +128 -> unsigned     */
2305 // clang-format on
2306 
2307 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
2308 // TODO(fbarchard): consider ptrdiff_t for all strides.
2309 
// Produces one U and one V sample for every 2x2 block of ARGB pixels, taken
// from the row at src_argb and the row at src_argb + src_stride_argb.
// For each channel, uaddlp adds horizontally adjacent pairs of the first row
// into 16-bit lanes and uadalp accumulates the pairs of the second row, giving
// the sum of the 4 pixels of the block. urshr #1 halves that sum with
// rounding, leaving twice the block average; RGBTOUV with the halved
// coefficients from RGBTOUV_SETUP_REG then converts it to U and V.
// The alpha channel is loaded by ld4 (v3, v7) but not used.
// Each iteration consumes 64 bytes (16 pixels) from each row and writes 8 U
// and 8 V bytes; the loop body runs before width is tested.
ARGBToUVRow_NEON(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)2310 void ARGBToUVRow_NEON(const uint8_t* src_argb,
2311                       int src_stride_argb,
2312                       uint8_t* dst_u,
2313                       uint8_t* dst_v,
2314                       int width) {
2315   const uint8_t* src_argb_1 = src_argb + src_stride_argb;  // second source row
2316   asm volatile (
2317     RGBTOUV_SETUP_REG
2318       "1:                                        \n"
2319       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
2320       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
2321       "prfm        pldl1keep, [%0, 448]          \n"  // prefetch first row ahead
2322       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2323       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
2324 
2325       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
2326       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
2327       "prfm        pldl1keep, [%1, 448]          \n"  // prefetch second row ahead
2328       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2329       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
2330 
2331       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2332       "urshr       v1.8h, v1.8h, #1              \n"
2333       "urshr       v2.8h, v2.8h, #1              \n"
2334 
2335       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop.
2336     RGBTOUV(v0.8h, v1.8h, v2.8h)
2337       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2338       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2339       "b.gt        1b                            \n"  // loop while width > 0
2340   : "+r"(src_argb),  // %0
2341     "+r"(src_argb_1),  // %1
2342     "+r"(dst_u),     // %2
2343     "+r"(dst_v),     // %3
2344     "+r"(width)        // %4
2345   :
2346   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2347     "v20", "v21", "v22", "v23", "v24", "v25"
2348   );
2349 }
2350 
2351 // TODO(fbarchard): Subsample match Intel code.
// Converts two adjacent rows of ARGB pixels into one row of 2x2-subsampled U
// and one row of V, using the coefficients loaded inline below (63, 42, 21,
// 10, 53 and a 0x8080 bias) rather than RGBTOUV_SETUP_REG.  Presumably this is
// the full-range (JPEG) matrix the "J" in the name refers to -- TODO confirm.
//   src_argb         first source row; bytes are read as B, G, R plus one
//                    unused byte per pixel
//   src_stride_argb  byte offset from src_argb to the second source row
//   dst_u, dst_v     outputs; 8 bytes are written to each per loop iteration
//   width            pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.
ARGBToUVJRow_NEON(const uint8_t * src_argb,int src_stride_argb,uint8_t * dst_u,uint8_t * dst_v,int width)2352 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
2353                        int src_stride_argb,
2354                        uint8_t* dst_u,
2355                        uint8_t* dst_v,
2356                        int width) {
2357   const uint8_t* src_argb_1 = src_argb + src_stride_argb;
2358   asm volatile (
2359       "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
2360       "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
2361       "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
2362       "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
2363       "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
2364       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
2365       "1:                                        \n"
2366       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      // Pairwise add: each 16-bit lane = sum of two horizontally adjacent bytes.
2367       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
2368       "prfm        pldl1keep, [%0, 448]          \n"
2369       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2370       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
2371       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2372       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
2373       "prfm        pldl1keep, [%1, 448]          \n"
2374       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2375       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
2376 
2377       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2378       "urshr       v1.8h, v1.8h, #1              \n"
2379       "urshr       v2.8h, v2.8h, #1              \n"
2380 
2381       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2382     RGBTOUV(v0.8h, v1.8h, v2.8h)
2383       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2384       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2385       "b.gt        1b                            \n"
2386   : "+r"(src_argb),  // %0
2387     "+r"(src_argb_1),  // %1
2388     "+r"(dst_u),     // %2
2389     "+r"(dst_v),     // %3
2390     "+r"(width)        // %4
2391   :
2392   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2393     "v20", "v21", "v22", "v23", "v24", "v25"
2394   );
2395 }
2396 
// Converts two adjacent rows of ABGR pixels into one row of 2x2-subsampled U
// and one row of V, using the coefficients loaded inline below (63, 42, 21,
// 10, 53 and a 0x8080 bias) rather than RGBTOUV_SETUP_REG.  Presumably this is
// the full-range (JPEG) matrix the "J" in the name refers to -- TODO confirm.
//   src_abgr         first source row; bytes are read as R, G, B plus one
//                    unused byte per pixel
//   src_stride_abgr  byte offset from src_abgr to the second source row
//   dst_uj, dst_vj   outputs; 8 bytes are written to each per loop iteration
//   width            pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.
ABGRToUVJRow_NEON(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_uj,uint8_t * dst_vj,int width)2397 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
2398                        int src_stride_abgr,
2399                        uint8_t* dst_uj,
2400                        uint8_t* dst_vj,
2401                        int width) {
2402   const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
2403   asm volatile (
2404       "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
2405       "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
2406       "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
2407       "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
2408       "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
2409       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
2410       "1:                                        \n"
2411       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      // Pairwise add: each 16-bit lane = sum of two horizontally adjacent bytes.
2412       "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
2413       "prfm        pldl1keep, [%0, 448]          \n"
2414       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2415       "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
2416       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2417       "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
2418       "prfm        pldl1keep, [%1, 448]          \n"
2419       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2420       "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
2421 
2422       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2423       "urshr       v1.8h, v1.8h, #1              \n"
2424       "urshr       v2.8h, v2.8h, #1              \n"
2425 
2426       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
      // R is in v0 and B in v2, so the B and R arguments are swapped.
2427     RGBTOUV(v2.8h, v1.8h, v0.8h)
2428       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2429       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2430       "b.gt        1b                            \n"
2431   : "+r"(src_abgr),  // %0
2432     "+r"(src_abgr_1),  // %1
2433     "+r"(dst_uj),     // %2
2434     "+r"(dst_vj),     // %3
2435     "+r"(width)        // %4
2436   :
2437   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2438     "v20", "v21", "v22", "v23", "v24", "v25"
2439   );
2440 }
2441 
// Converts two adjacent rows of 3-byte RGB24 pixels into one row of
// 2x2-subsampled U and one row of V, using the coefficients loaded inline
// below (63, 42, 21, 10, 53 and a 0x8080 bias) rather than RGBTOUV_SETUP_REG.
// Presumably this is the full-range (JPEG) matrix the "J" in the name refers
// to -- TODO confirm.
//   src_rgb24         first source row; bytes are read as B, G, R per pixel
//   src_stride_rgb24  byte offset from src_rgb24 to the second source row
//   dst_u, dst_v      outputs; 8 bytes are written to each per loop iteration
//   width             pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels (48 bytes)
// per row are always read.
RGB24ToUVJRow_NEON(const uint8_t * src_rgb24,int src_stride_rgb24,uint8_t * dst_u,uint8_t * dst_v,int width)2442 void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
2443                         int src_stride_rgb24,
2444                         uint8_t* dst_u,
2445                         uint8_t* dst_v,
2446                         int width) {
2447   const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
2448   asm volatile (
2449       "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
2450       "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
2451       "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
2452       "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
2453       "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
2454       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
2455       "1:                                        \n"
2456       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
      // Pairwise add: each 16-bit lane = sum of two horizontally adjacent bytes.
2457       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
2458       "prfm        pldl1keep, [%0, 448]          \n"
2459       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2460       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
2461       "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load next 16
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2462       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
2463       "prfm        pldl1keep, [%1, 448]          \n"
2464       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2465       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
2466 
2467       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2468       "urshr       v1.8h, v1.8h, #1              \n"
2469       "urshr       v2.8h, v2.8h, #1              \n"
2470 
2471       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2472     RGBTOUV(v0.8h, v1.8h, v2.8h)
2473       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2474       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2475       "b.gt        1b                            \n"
2476   : "+r"(src_rgb24),  // %0
2477     "+r"(src_rgb24_1),  // %1
2478     "+r"(dst_u),     // %2
2479     "+r"(dst_v),     // %3
2480     "+r"(width)        // %4
2481   :
2482   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2483     "v20", "v21", "v22", "v23", "v24", "v25"
2484   );
2485 }
2486 
// Converts two adjacent rows of 3-byte RAW pixels into one row of
// 2x2-subsampled U and one row of V, using the coefficients loaded inline
// below (63, 42, 21, 10, 53 and a 0x8080 bias) rather than RGBTOUV_SETUP_REG.
// Presumably this is the full-range (JPEG) matrix the "J" in the name refers
// to -- TODO confirm.
//   src_raw         first source row; bytes are read as R, G, B per pixel
//                   (v0 is passed to RGBTOUV as R and v2 as B)
//   src_stride_raw  byte offset from src_raw to the second source row
//   dst_u, dst_v    outputs; 8 bytes are written to each per loop iteration
//   width           pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels (48 bytes)
// per row are always read.
RAWToUVJRow_NEON(const uint8_t * src_raw,int src_stride_raw,uint8_t * dst_u,uint8_t * dst_v,int width)2487 void RAWToUVJRow_NEON(const uint8_t* src_raw,
2488                       int src_stride_raw,
2489                       uint8_t* dst_u,
2490                       uint8_t* dst_v,
2491                       int width) {
2492   const uint8_t* src_raw_1 = src_raw + src_stride_raw;
2493   asm volatile (
2494       "movi        v20.8h, #63, lsl #0           \n"  // UB/VR coeff (0.500) / 2
2495       "movi        v21.8h, #42, lsl #0           \n"  // UG coeff (-0.33126) / 2
2496       "movi        v22.8h, #21, lsl #0           \n"  // UR coeff (-0.16874) / 2
2497       "movi        v23.8h, #10, lsl #0           \n"  // VB coeff (-0.08131) / 2
2498       "movi        v24.8h, #53, lsl #0           \n"  // VG coeff (-0.41869) / 2
2499       "movi        v25.16b, #0x80                \n"  // 128.5 (0x8080 in 16-bit)
2500       "1:                                        \n"
2501       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
      // Pairwise add: each 16-bit lane = sum of two horizontally adjacent bytes.
2502       "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
2503       "prfm        pldl1keep, [%0, 448]          \n"
2504       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2505       "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
2506       "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load next 16
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2507       "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
2508       "prfm        pldl1keep, [%1, 448]          \n"
2509       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2510       "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
2511 
2512       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2513       "urshr       v1.8h, v1.8h, #1              \n"
2514       "urshr       v2.8h, v2.8h, #1              \n"
2515 
2516       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
      // R is in v0 and B in v2, so the B and R arguments are swapped.
2517     RGBTOUV(v2.8h, v1.8h, v0.8h)
2518       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2519       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2520       "b.gt        1b                            \n"
2521   : "+r"(src_raw),  // %0
2522     "+r"(src_raw_1),  // %1
2523     "+r"(dst_u),     // %2
2524     "+r"(dst_v),     // %3
2525     "+r"(width)        // %4
2526   :
2527   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2528     "v20", "v21", "v22", "v23", "v24", "v25"
2529   );
2530 }
2531 
// Converts two adjacent rows of BGRA pixels into one row of 2x2-subsampled U
// and one row of V.
//   src_bgra         first source row; of each pixel's four bytes the first is
//                    unused and the remaining three are read as R, G, B
//   src_stride_bgra  byte offset from src_bgra to the second source row
//   dst_u, dst_v     outputs; 8 bytes are written to each per loop iteration
//   width            pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.  Presumably callers pass a positive multiple of 16 -- TODO
// confirm against the dispatch code.
BGRAToUVRow_NEON(const uint8_t * src_bgra,int src_stride_bgra,uint8_t * dst_u,uint8_t * dst_v,int width)2532 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
2533                       int src_stride_bgra,
2534                       uint8_t* dst_u,
2535                       uint8_t* dst_v,
2536                       int width) {
2537   const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
2538   asm volatile (
2539     RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2540       "1:                                        \n"
2541       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      // The loaded v0 is discarded.  Order matters: v3 is read as the B source
      // before it is reused as the G accumulator.
2542       "uaddlp      v0.8h, v3.16b                 \n"  // B 16 bytes -> 8 shorts.
2543       "prfm        pldl1keep, [%0, 448]          \n"
2544       "uaddlp      v3.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
2545       "uaddlp      v2.8h, v1.16b                 \n"  // R 16 bytes -> 8 shorts.
2546       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2547       "uadalp      v0.8h, v7.16b                 \n"  // B 16 bytes -> 8 shorts.
2548       "prfm        pldl1keep, [%1, 448]          \n"
2549       "uadalp      v3.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
2550       "uadalp      v2.8h, v5.16b                 \n"  // R 16 bytes -> 8 shorts.
2551 
2552       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2553       "urshr       v1.8h, v3.8h, #1              \n"  // G moves from v3 to v1
2554       "urshr       v2.8h, v2.8h, #1              \n"
2555 
2556       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2557     RGBTOUV(v0.8h, v1.8h, v2.8h)
2558       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2559       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2560       "b.gt        1b                            \n"
2561   : "+r"(src_bgra),  // %0
2562     "+r"(src_bgra_1),  // %1
2563     "+r"(dst_u),     // %2
2564     "+r"(dst_v),     // %3
2565     "+r"(width)        // %4
2566   :
2567   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2568     "v20", "v21", "v22", "v23", "v24", "v25"
2569   );
2570 }
2571 
// Converts two adjacent rows of ABGR pixels into one row of 2x2-subsampled U
// and one row of V.
//   src_abgr         first source row; bytes are read as R, G, B plus one
//                    unused byte per pixel
//   src_stride_abgr  byte offset from src_abgr to the second source row
//   dst_u, dst_v     outputs; 8 bytes are written to each per loop iteration
//   width            pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.  Presumably callers pass a positive multiple of 16 -- TODO
// confirm against the dispatch code.
ABGRToUVRow_NEON(const uint8_t * src_abgr,int src_stride_abgr,uint8_t * dst_u,uint8_t * dst_v,int width)2572 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
2573                       int src_stride_abgr,
2574                       uint8_t* dst_u,
2575                       uint8_t* dst_v,
2576                       int width) {
2577   const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
2578   asm volatile (
2579     RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2580       "1:                                        \n"
2581       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      // Each channel's sums land one register higher than its source.  Order
      // matters: every source register is read before it is overwritten.
2582       "uaddlp      v3.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
2583       "prfm        pldl1keep, [%0, 448]          \n"
2584       "uaddlp      v2.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2585       "uaddlp      v1.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
2586       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2587       "uadalp      v3.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
2588       "prfm        pldl1keep, [%1, 448]          \n"
2589       "uadalp      v2.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2590       "uadalp      v1.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
2591 
2592       "urshr       v0.8h, v3.8h, #1              \n"  // 2x average; B moves from v3 to v0
2593       "urshr       v2.8h, v2.8h, #1              \n"
2594       "urshr       v1.8h, v1.8h, #1              \n"
2595 
2596       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
      // B is in v0, G in v2 and R in v1 at this point.
2597     RGBTOUV(v0.8h, v2.8h, v1.8h)
2598       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2599       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2600       "b.gt        1b                            \n"
2601   : "+r"(src_abgr),  // %0
2602     "+r"(src_abgr_1),  // %1
2603     "+r"(dst_u),     // %2
2604     "+r"(dst_v),     // %3
2605     "+r"(width)        // %4
2606   :
2607   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2608     "v20", "v21", "v22", "v23", "v24", "v25"
2609   );
2610 }
2611 
// Converts two adjacent rows of RGBA pixels into one row of 2x2-subsampled U
// and one row of V.
//   src_rgba         first source row; of each pixel's four bytes the first is
//                    unused and the remaining three are read as B, G, R
//   src_stride_rgba  byte offset from src_rgba to the second source row
//   dst_u, dst_v     outputs; 8 bytes are written to each per loop iteration
//   width            pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.  Presumably callers pass a positive multiple of 16 -- TODO
// confirm against the dispatch code.
RGBAToUVRow_NEON(const uint8_t * src_rgba,int src_stride_rgba,uint8_t * dst_u,uint8_t * dst_v,int width)2612 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
2613                       int src_stride_rgba,
2614                       uint8_t* dst_u,
2615                       uint8_t* dst_v,
2616                       int width) {
2617   const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
2618   asm volatile (
2619     RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2620       "1:                                        \n"
2621       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
      // The loaded v0 is discarded; each channel's sums land one register lower
      // than its source.  Order matters: sources are read before being reused.
2622       "uaddlp      v0.8h, v1.16b                 \n"  // B 16 bytes -> 8 shorts.
2623       "prfm        pldl1keep, [%0, 448]          \n"
2624       "uaddlp      v1.8h, v2.16b                 \n"  // G 16 bytes -> 8 shorts.
2625       "uaddlp      v2.8h, v3.16b                 \n"  // R 16 bytes -> 8 shorts.
2626       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2627       "uadalp      v0.8h, v5.16b                 \n"  // B 16 bytes -> 8 shorts.
2628       "prfm        pldl1keep, [%1, 448]          \n"
2629       "uadalp      v1.8h, v6.16b                 \n"  // G 16 bytes -> 8 shorts.
2630       "uadalp      v2.8h, v7.16b                 \n"  // R 16 bytes -> 8 shorts.
2631 
2632       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2633       "urshr       v1.8h, v1.8h, #1              \n"
2634       "urshr       v2.8h, v2.8h, #1              \n"
2635 
2636       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2637     RGBTOUV(v0.8h, v1.8h, v2.8h)
2638       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2639       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2640       "b.gt        1b                            \n"
2641   : "+r"(src_rgba),  // %0
2642     "+r"(src_rgba_1),  // %1
2643     "+r"(dst_u),     // %2
2644     "+r"(dst_v),     // %3
2645     "+r"(width)        // %4
2646   :
2647   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2648     "v20", "v21", "v22", "v23", "v24", "v25"
2649   );
2650 }
2651 
// Converts two adjacent rows of 3-byte RGB24 pixels into one row of
// 2x2-subsampled U and one row of V.
//   src_rgb24         first source row; bytes are read as B, G, R per pixel
//   src_stride_rgb24  byte offset from src_rgb24 to the second source row
//   dst_u, dst_v      outputs; 8 bytes are written to each per loop iteration
//   width             pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels (48 bytes)
// per row are always read.  Presumably callers pass a positive multiple of
// 16 -- TODO confirm against the dispatch code.
RGB24ToUVRow_NEON(const uint8_t * src_rgb24,int src_stride_rgb24,uint8_t * dst_u,uint8_t * dst_v,int width)2652 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
2653                        int src_stride_rgb24,
2654                        uint8_t* dst_u,
2655                        uint8_t* dst_v,
2656                        int width) {
2657   const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
2658   asm volatile (
2659     RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2660       "1:                                        \n"
2661       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
      // Pairwise add: each 16-bit lane = sum of two horizontally adjacent bytes.
2662       "uaddlp      v0.8h, v0.16b                 \n"  // B 16 bytes -> 8 shorts.
2663       "prfm        pldl1keep, [%0, 448]          \n"
2664       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2665       "uaddlp      v2.8h, v2.16b                 \n"  // R 16 bytes -> 8 shorts.
2666       "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2667       "uadalp      v0.8h, v4.16b                 \n"  // B 16 bytes -> 8 shorts.
2668       "prfm        pldl1keep, [%1, 448]          \n"
2669       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2670       "uadalp      v2.8h, v6.16b                 \n"  // R 16 bytes -> 8 shorts.
2671 
2672       "urshr       v0.8h, v0.8h, #1              \n"  // 2x average
2673       "urshr       v1.8h, v1.8h, #1              \n"
2674       "urshr       v2.8h, v2.8h, #1              \n"
2675 
2676       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2677     RGBTOUV(v0.8h, v1.8h, v2.8h)
2678       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2679       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2680       "b.gt        1b                            \n"
2681   : "+r"(src_rgb24),  // %0
2682     "+r"(src_rgb24_1),  // %1
2683     "+r"(dst_u),     // %2
2684     "+r"(dst_v),     // %3
2685     "+r"(width)        // %4
2686   :
2687   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2688     "v20", "v21", "v22", "v23", "v24", "v25"
2689   );
2690 }
2691 
// Converts two adjacent rows of 3-byte RAW pixels into one row of
// 2x2-subsampled U and one row of V.
//   src_raw         first source row; bytes are read as R, G, B per pixel
//   src_stride_raw  byte offset from src_raw to the second source row
//   dst_u, dst_v    outputs; 8 bytes are written to each per loop iteration
//   width           pixel count; reduced by 16 per iteration
// The loop body runs before width is tested, so at least 16 pixels (48 bytes)
// per row are always read.  Presumably callers pass a positive multiple of
// 16 -- TODO confirm against the dispatch code.
RAWToUVRow_NEON(const uint8_t * src_raw,int src_stride_raw,uint8_t * dst_u,uint8_t * dst_v,int width)2692 void RAWToUVRow_NEON(const uint8_t* src_raw,
2693                      int src_stride_raw,
2694                      uint8_t* dst_u,
2695                      uint8_t* dst_v,
2696                      int width) {
2697   const uint8_t* src_raw_1 = src_raw + src_stride_raw;
2698   asm volatile (
2699     RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2700       "1:                                        \n"
2701       "ld3         {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RAW pixels.
      // Pairwise add: each 16-bit lane = sum of two horizontally adjacent bytes.
2702       "uaddlp      v2.8h, v2.16b                 \n"  // B 16 bytes -> 8 shorts.
2703       "prfm        pldl1keep, [%0, 448]          \n"
2704       "uaddlp      v1.8h, v1.16b                 \n"  // G 16 bytes -> 8 shorts.
2705       "uaddlp      v0.8h, v0.16b                 \n"  // R 16 bytes -> 8 shorts.
2706       "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
      // Accumulate the second row: each lane now holds a 2x2 block sum.
2707       "uadalp      v2.8h, v6.16b                 \n"  // B 16 bytes -> 8 shorts.
2708       "prfm        pldl1keep, [%1, 448]          \n"
2709       "uadalp      v1.8h, v5.16b                 \n"  // G 16 bytes -> 8 shorts.
2710       "uadalp      v0.8h, v4.16b                 \n"  // R 16 bytes -> 8 shorts.
2711 
2712       "urshr       v2.8h, v2.8h, #1              \n"  // 2x average
2713       "urshr       v1.8h, v1.8h, #1              \n"
2714       "urshr       v0.8h, v0.8h, #1              \n"
2715 
2716       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
      // R is in v0 and B in v2, so the B and R arguments are swapped.
2717     RGBTOUV(v2.8h, v1.8h, v0.8h)
2718       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2719       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2720       "b.gt        1b                            \n"
2721   : "+r"(src_raw),  // %0
2722     "+r"(src_raw_1),  // %1
2723     "+r"(dst_u),     // %2
2724     "+r"(dst_v),     // %3
2725     "+r"(width)        // %4
2726   :
2727   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2728     "v20", "v21", "v22", "v23", "v24", "v25"
2729   );
2730 }
2731 
2732 // 16x2 pixels -> 8x1.  width is number of rgb pixels. e.g. 16.
// Reads two adjacent rows of 16-bit RGB565 pixels and writes one row of
// 2x2-subsampled U and one row of V.
//   src_rgb565         first source row (2 bytes per pixel)
//   src_stride_rgb565  byte offset from src_rgb565 to the second source row
//   dst_u, dst_v       outputs; 8 bytes are written to each per loop iteration
//   width              pixel count; reduced by 16 per iteration
// Each row is consumed in two 8-pixel (16-byte) loads.  After every load the
// RGB565TOARGB macro (defined elsewhere in this file) is expected to leave
// 8-bit B, G, R in the low halves of v0, v1, v2 -- NOTE(review): confirm
// against the macro definition.
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.
RGB565ToUVRow_NEON(const uint8_t * src_rgb565,int src_stride_rgb565,uint8_t * dst_u,uint8_t * dst_v,int width)2733 void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
2734                         int src_stride_rgb565,
2735                         uint8_t* dst_u,
2736                         uint8_t* dst_v,
2737                         int width) {
2738   const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
2739   asm volatile(
2740       RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2741       "1:                                        \n"
      // First row, pixels 0-7: pair sums go to v16-v18.
2742       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
2743       RGB565TOARGB
2744       "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2745       "prfm        pldl1keep, [%0, 448]          \n"
2746       "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2747       "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // First row, pixels 8-15: pair sums go to v26-v28.
2748       "ld1         {v0.16b}, [%0], #16           \n"  // next 8 RGB565 pixels.
2749       RGB565TOARGB
2750       "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2751       "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2752       "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2753 
      // Second row: accumulate into the same registers to form 2x2 block sums.
2754       "ld1         {v0.16b}, [%1], #16           \n"  // load 8 RGB565 pixels.
2755       RGB565TOARGB
2756       "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2757       "prfm        pldl1keep, [%1, 448]          \n"
2758       "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2759       "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2760       "ld1         {v0.16b}, [%1], #16           \n"  // next 8 RGB565 pixels.
2761       RGB565TOARGB
2762       "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2763       "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2764       "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2765 
      // Join the two 4-lane halves into full 8-lane registers v16-v18.
2766       "ins         v16.D[1], v26.D[0]            \n"
2767       "ins         v17.D[1], v27.D[0]            \n"
2768       "ins         v18.D[1], v28.D[0]            \n"
2769 
2770       "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
2771       "urshr       v1.8h, v17.8h, #1             \n"
2772       "urshr       v2.8h, v18.8h, #1             \n"
2773 
2774       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2775       RGBTOUV(v0.8h, v1.8h, v2.8h)
2776       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2777       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2778       "b.gt        1b                            \n"
2779       : "+r"(src_rgb565),    // %0
2780         "+r"(src_rgb565_1),  // %1
2781         "+r"(dst_u),           // %2
2782         "+r"(dst_v),           // %3
2783         "+r"(width)            // %4
2784       :
2785       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
2786         "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2787         "v28");
2788 }
2789 
2790 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Reads two adjacent rows of 16-bit ARGB1555 pixels and writes one row of
// 2x2-subsampled U and one row of V.
//   src_argb1555         first source row (2 bytes per pixel)
//   src_stride_argb1555  byte offset from src_argb1555 to the second row
//   dst_u, dst_v         outputs; 8 bytes are written to each per iteration
//   width                pixel count; reduced by 16 per iteration
// Each row is consumed in two 8-pixel (16-byte) loads.  After every load the
// RGB555TOARGB macro (defined elsewhere in this file) is expected to leave
// 8-bit B, G, R in the low halves of v0, v1, v2 -- NOTE(review): confirm
// against the macro definition.
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.
ARGB1555ToUVRow_NEON(const uint8_t * src_argb1555,int src_stride_argb1555,uint8_t * dst_u,uint8_t * dst_v,int width)2791 void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
2792                           int src_stride_argb1555,
2793                           uint8_t* dst_u,
2794                           uint8_t* dst_v,
2795                           int width) {
2796   const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
2797   asm volatile(
2798       RGBTOUV_SETUP_REG  // expected to load v20-v25, which RGBTOUV reads
2799       "1:                                        \n"
      // First row, pixels 0-7: pair sums go to v16-v18.
2800       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
2801       RGB555TOARGB
2802       "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2803       "prfm        pldl1keep, [%0, 448]          \n"
2804       "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2805       "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // First row, pixels 8-15: pair sums go to v26-v28.
2806       "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB1555 pixels.
2807       RGB555TOARGB
2808       "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2809       "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2810       "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2811 
      // Second row: accumulate into the same registers to form 2x2 block sums.
2812       "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB1555 pixels.
2813       RGB555TOARGB
2814       "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2815       "prfm        pldl1keep, [%1, 448]          \n"
2816       "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2817       "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2818       "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB1555 pixels.
2819       RGB555TOARGB
2820       "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2821       "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2822       "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2823 
      // Join the two 4-lane halves into full 8-lane registers v16-v18.
2824       "ins         v16.D[1], v26.D[0]            \n"
2825       "ins         v17.D[1], v27.D[0]            \n"
2826       "ins         v18.D[1], v28.D[0]            \n"
2827 
2828       "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
2829       "urshr       v1.8h, v17.8h, #1             \n"
2830       "urshr       v2.8h, v18.8h, #1             \n"
2831 
2832       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2833       RGBTOUV(v0.8h, v1.8h, v2.8h)
2834       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2835       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2836       "b.gt        1b                            \n"
2837       : "+r"(src_argb1555),    // %0
2838         "+r"(src_argb1555_1),  // %1
2839         "+r"(dst_u),           // %2
2840         "+r"(dst_v),           // %3
2841         "+r"(width)            // %4
2842       :
2843       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
2844         "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2845         "v28");
2846 }
2847 
2848 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Reads two adjacent rows of 16-bit ARGB4444 pixels and writes one row of
// 2x2-subsampled U and one row of V.
//   src_argb4444         first source row (2 bytes per pixel)
//   src_stride_argb4444  byte offset from src_argb4444 to the second row
//   dst_u, dst_v         outputs; 8 bytes are written to each per iteration
//   width                pixel count; reduced by 16 per iteration
// Each row is consumed in two 8-pixel (16-byte) loads.  After every load the
// ARGB4444TOARGB macro (defined elsewhere in this file) is expected to leave
// 8-bit B, G, R in the low halves of v0, v1, v2 -- NOTE(review): confirm
// against the macro definition.
// The loop body runs before width is tested, so at least 16 pixels per row are
// always read.
ARGB4444ToUVRow_NEON(const uint8_t * src_argb4444,int src_stride_argb4444,uint8_t * dst_u,uint8_t * dst_v,int width)2849 void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
2850                           int src_stride_argb4444,
2851                           uint8_t* dst_u,
2852                           uint8_t* dst_v,
2853                           int width) {
2854   const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2855   asm volatile(
2856       RGBTOUV_SETUP_REG  // sets v20-v25
2857       "1:                                        \n"
      // First row, pixels 0-7: pair sums go to v16-v18.
2858       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
2859       ARGB4444TOARGB
2860       "uaddlp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2861       "prfm        pldl1keep, [%0, 448]          \n"
2862       "uaddlp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2863       "uaddlp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
      // First row, pixels 8-15: pair sums go to v26-v28.
2864       "ld1         {v0.16b}, [%0], #16           \n"  // next 8 ARGB4444 pixels.
2865       ARGB4444TOARGB
2866       "uaddlp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2867       "uaddlp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2868       "uaddlp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2869 
      // Second row: accumulate into the same registers to form 2x2 block sums.
2870       "ld1         {v0.16b}, [%1], #16           \n"  // load 8 ARGB4444 pixels.
2871       ARGB4444TOARGB
2872       "uadalp      v16.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2873       "prfm        pldl1keep, [%1, 448]          \n"
2874       "uadalp      v17.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2875       "uadalp      v18.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2876       "ld1         {v0.16b}, [%1], #16           \n"  // next 8 ARGB4444 pixels.
2877       ARGB4444TOARGB
2878       "uadalp      v26.4h, v0.8b                 \n"  // B 8 bytes -> 4 shorts.
2879       "uadalp      v27.4h, v1.8b                 \n"  // G 8 bytes -> 4 shorts.
2880       "uadalp      v28.4h, v2.8b                 \n"  // R 8 bytes -> 4 shorts.
2881 
      // Join the two 4-lane halves into full 8-lane registers v16-v18.
2882       "ins         v16.D[1], v26.D[0]            \n"
2883       "ins         v17.D[1], v27.D[0]            \n"
2884       "ins         v18.D[1], v28.D[0]            \n"
2885 
2886       "urshr       v0.8h, v16.8h, #1             \n"  // 2x average
2887       "urshr       v1.8h, v17.8h, #1             \n"
2888       "urshr       v2.8h, v18.8h, #1             \n"
2889 
2890       "subs        %w4, %w4, #16                 \n"  // 16 processed per loop; flags tested by b.gt below.
2891       RGBTOUV(v0.8h, v1.8h, v2.8h)
2892       "st1         {v0.8b}, [%2], #8             \n"  // store 8 pixels U.
2893       "st1         {v1.8b}, [%3], #8             \n"  // store 8 pixels V.
2894       "b.gt        1b                            \n"
2895       : "+r"(src_argb4444),    // %0
2896         "+r"(src_argb4444_1),  // %1
2897         "+r"(dst_u),           // %2
2898         "+r"(dst_v),           // %3
2899         "+r"(width)            // %4
2900       :
2901       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
2902         "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2903         "v28"
2904 
2905   );
2906 }
2907 
RGB565ToYRow_NEON(const uint8_t * src_rgb565,uint8_t * dst_y,int width)2908 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
2909   asm volatile(
2910       "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
2911       "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
2912       "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
2913       "movi        v27.8b, #16                   \n"  // Add 16 constant
2914       "1:                                        \n"
2915       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 RGB565 pixels.
2916       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2917       RGB565TOARGB
2918       "umull       v3.8h, v0.8b, v24.8b          \n"  // B
2919       "prfm        pldl1keep, [%0, 448]          \n"
2920       "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
2921       "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
2922       "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
2923       "uqadd       v0.8b, v0.8b, v27.8b          \n"
2924       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
2925       "b.gt        1b                            \n"
2926       : "+r"(src_rgb565),  // %0
2927         "+r"(dst_y),       // %1
2928         "+r"(width)        // %2
2929       :
2930       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
2931         "v27");
2932 }
2933 
ARGB1555ToYRow_NEON(const uint8_t * src_argb1555,uint8_t * dst_y,int width)2934 void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
2935                          uint8_t* dst_y,
2936                          int width) {
2937   asm volatile(
2938       "movi        v4.8b, #25                    \n"  // B * 0.1016 coefficient
2939       "movi        v5.8b, #129                   \n"  // G * 0.5078 coefficient
2940       "movi        v6.8b, #66                    \n"  // R * 0.2578 coefficient
2941       "movi        v7.8b, #16                    \n"  // Add 16 constant
2942       "1:                                        \n"
2943       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB1555 pixels.
2944       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2945       ARGB1555TOARGB
2946       "umull       v3.8h, v0.8b, v4.8b           \n"  // B
2947       "prfm        pldl1keep, [%0, 448]          \n"
2948       "umlal       v3.8h, v1.8b, v5.8b           \n"  // G
2949       "umlal       v3.8h, v2.8b, v6.8b           \n"  // R
2950       "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
2951       "uqadd       v0.8b, v0.8b, v7.8b           \n"
2952       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
2953       "b.gt        1b                            \n"
2954       : "+r"(src_argb1555),  // %0
2955         "+r"(dst_y),         // %1
2956         "+r"(width)          // %2
2957       :
2958       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2959 }
2960 
ARGB4444ToYRow_NEON(const uint8_t * src_argb4444,uint8_t * dst_y,int width)2961 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
2962                          uint8_t* dst_y,
2963                          int width) {
2964   asm volatile(
2965       "movi        v24.8b, #25                   \n"  // B * 0.1016 coefficient
2966       "movi        v25.8b, #129                  \n"  // G * 0.5078 coefficient
2967       "movi        v26.8b, #66                   \n"  // R * 0.2578 coefficient
2968       "movi        v27.8b, #16                   \n"  // Add 16 constant
2969       "1:                                        \n"
2970       "ld1         {v0.16b}, [%0], #16           \n"  // load 8 ARGB4444 pixels.
2971       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
2972       ARGB4444TOARGB
2973       "umull       v3.8h, v0.8b, v24.8b          \n"  // B
2974       "prfm        pldl1keep, [%0, 448]          \n"
2975       "umlal       v3.8h, v1.8b, v25.8b          \n"  // G
2976       "umlal       v3.8h, v2.8b, v26.8b          \n"  // R
2977       "uqrshrn     v0.8b, v3.8h, #8              \n"  // 16 bit to 8 bit Y
2978       "uqadd       v0.8b, v0.8b, v27.8b          \n"
2979       "st1         {v0.8b}, [%1], #8             \n"  // store 8 pixels Y.
2980       "b.gt        1b                            \n"
2981       : "+r"(src_argb4444),  // %0
2982         "+r"(dst_y),         // %1
2983         "+r"(width)          // %2
2984       :
2985       : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
2986 }
2987 
// Coefficient table consumed by the *ToYMatrixRow_NEON kernels in this file.
// Those kernels fetch the table with one 8 byte load ("ldr d0"), so the struct
// carries explicit padding to make it 8 bytes and keep that load inside the
// object.
struct RgbConstants {
  uint8_t kRGBToY[4];  // weights for pixel bytes 0..2; the 4th entry is unused
  uint16_t kAddY;      // added to the 16 bit sum before the high byte is taken
  uint16_t pad;        // never read as data; zero when left out of an
                       // initializer
};
2992 
2993 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
ARGBToYMatrixRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width,const struct RgbConstants * rgbconstants)2994 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
2995                            uint8_t* dst_y,
2996                            int width,
2997                            const struct RgbConstants* rgbconstants) {
2998   asm volatile(
2999       "ldr         d0, [%3]                      \n"  // load rgbconstants
3000       "dup         v6.16b, v0.b[0]               \n"
3001       "dup         v7.16b, v0.b[1]               \n"
3002       "dup         v16.16b, v0.b[2]              \n"
3003       "dup         v17.8h,  v0.h[2]              \n"
3004       "1:                                        \n"
3005       "ld4         {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n"  // load 16
3006                                                                  // pixels.
3007       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
3008       "umull       v0.8h, v2.8b, v6.8b           \n"  // B
3009       "umull2      v1.8h, v2.16b, v6.16b         \n"
3010       "prfm        pldl1keep, [%0, 448]          \n"
3011       "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
3012       "umlal2      v1.8h, v3.16b, v7.16b         \n"
3013       "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
3014       "umlal2      v1.8h, v4.16b, v16.16b        \n"
3015       "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
3016       "addhn       v1.8b, v1.8h, v17.8h          \n"
3017       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
3018       "b.gt        1b                            \n"
3019       : "+r"(src_argb),    // %0
3020         "+r"(dst_y),       // %1
3021         "+r"(width)        // %2
3022       : "r"(rgbconstants)  // %3
3023       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3024         "v17");
3025 }
3026 
3027 // RGB to JPeg coefficients
3028 // B * 0.1140 coefficient = 29
3029 // G * 0.5870 coefficient = 150
3030 // R * 0.2990 coefficient = 77
3031 // Add 0.5 = 0x80
3032 static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
3033 
3034 static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
3035 
3036 // RGB to BT.601 coefficients
3037 // B * 0.1016 coefficient = 25
3038 // G * 0.5078 coefficient = 129
3039 // R * 0.2578 coefficient = 66
3040 // Add 16.5 = 0x1080
3041 
3042 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
3043                                                         0x1080};
3044 
3045 static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
3046 
ARGBToYRow_NEON(const uint8_t * src_argb,uint8_t * dst_y,int width)3047 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
3048   ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
3049 }
3050 
ARGBToYJRow_NEON(const uint8_t * src_argb,uint8_t * dst_yj,int width)3051 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
3052   ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
3053 }
3054 
ABGRToYRow_NEON(const uint8_t * src_abgr,uint8_t * dst_y,int width)3055 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
3056   ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
3057 }
3058 
ABGRToYJRow_NEON(const uint8_t * src_abgr,uint8_t * dst_yj,int width)3059 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
3060   ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
3061 }
3062 
3063 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
3064 // Same code as ARGB, except the LD4
RGBAToYMatrixRow_NEON(const uint8_t * src_rgba,uint8_t * dst_y,int width,const struct RgbConstants * rgbconstants)3065 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
3066                            uint8_t* dst_y,
3067                            int width,
3068                            const struct RgbConstants* rgbconstants) {
3069   asm volatile(
3070       "ldr         d0, [%3]                      \n"  // load rgbconstants
3071       "dup         v6.16b, v0.b[0]               \n"
3072       "dup         v7.16b, v0.b[1]               \n"
3073       "dup         v16.16b, v0.b[2]              \n"
3074       "dup         v17.8h,  v0.h[2]              \n"
3075       "1:                                        \n"
3076       "ld4         {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n"  // load 16
3077                                                                  // pixels.
3078       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
3079       "umull       v0.8h, v2.8b, v6.8b           \n"  // B
3080       "umull2      v1.8h, v2.16b, v6.16b         \n"
3081       "prfm        pldl1keep, [%0, 448]          \n"
3082       "umlal       v0.8h, v3.8b, v7.8b           \n"  // G
3083       "umlal2      v1.8h, v3.16b, v7.16b         \n"
3084       "umlal       v0.8h, v4.8b, v16.8b          \n"  // R
3085       "umlal2      v1.8h, v4.16b, v16.16b        \n"
3086       "addhn       v0.8b, v0.8h, v17.8h          \n"  // 16 bit to 8 bit Y
3087       "addhn       v1.8b, v1.8h, v17.8h          \n"
3088       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
3089       "b.gt        1b                            \n"
3090       : "+r"(src_rgba),    // %0
3091         "+r"(dst_y),       // %1
3092         "+r"(width)        // %2
3093       : "r"(rgbconstants)  // %3
3094       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3095         "v17");
3096 }
3097 
RGBAToYRow_NEON(const uint8_t * src_rgba,uint8_t * dst_y,int width)3098 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
3099   RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
3100 }
3101 
RGBAToYJRow_NEON(const uint8_t * src_rgba,uint8_t * dst_yj,int width)3102 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
3103   RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
3104 }
3105 
BGRAToYRow_NEON(const uint8_t * src_bgra,uint8_t * dst_y,int width)3106 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
3107   RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
3108 }
3109 
RGBToYMatrixRow_NEON(const uint8_t * src_rgb,uint8_t * dst_y,int width,const struct RgbConstants * rgbconstants)3110 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
3111                           uint8_t* dst_y,
3112                           int width,
3113                           const struct RgbConstants* rgbconstants) {
3114   asm volatile(
3115       "ldr         d0, [%3]                      \n"  // load rgbconstants
3116       "dup         v5.16b, v0.b[0]               \n"
3117       "dup         v6.16b, v0.b[1]               \n"
3118       "dup         v7.16b, v0.b[2]               \n"
3119       "dup         v16.8h,  v0.h[2]              \n"
3120       "1:                                        \n"
3121       "ld3         {v2.16b,v3.16b,v4.16b}, [%0], #48 \n"  // load 16 pixels.
3122       "subs        %w2, %w2, #16                 \n"  // 16 processed per loop.
3123       "umull       v0.8h, v2.8b, v5.8b           \n"  // B
3124       "umull2      v1.8h, v2.16b, v5.16b         \n"
3125       "prfm        pldl1keep, [%0, 448]          \n"
3126       "umlal       v0.8h, v3.8b, v6.8b           \n"  // G
3127       "umlal2      v1.8h, v3.16b, v6.16b         \n"
3128       "umlal       v0.8h, v4.8b, v7.8b           \n"  // R
3129       "umlal2      v1.8h, v4.16b, v7.16b         \n"
3130       "addhn       v0.8b, v0.8h, v16.8h          \n"  // 16 bit to 8 bit Y
3131       "addhn       v1.8b, v1.8h, v16.8h          \n"
3132       "st1         {v0.8b, v1.8b}, [%1], #16     \n"  // store 16 pixels Y.
3133       "b.gt        1b                            \n"
3134       : "+r"(src_rgb),     // %0
3135         "+r"(dst_y),       // %1
3136         "+r"(width)        // %2
3137       : "r"(rgbconstants)  // %3
3138       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
3139 }
3140 
RGB24ToYJRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_yj,int width)3141 void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
3142   RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
3143 }
3144 
RAWToYJRow_NEON(const uint8_t * src_raw,uint8_t * dst_yj,int width)3145 void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
3146   RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
3147 }
3148 
RGB24ToYRow_NEON(const uint8_t * src_rgb24,uint8_t * dst_y,int width)3149 void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
3150   RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
3151 }
3152 
RAWToYRow_NEON(const uint8_t * src_raw,uint8_t * dst_y,int width)3153 void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
3154   RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
3155 }
3156 
3157 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8_t * dst_ptr,const uint8_t * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)3158 void InterpolateRow_NEON(uint8_t* dst_ptr,
3159                          const uint8_t* src_ptr,
3160                          ptrdiff_t src_stride,
3161                          int dst_width,
3162                          int source_y_fraction) {
3163   int y1_fraction = source_y_fraction;
3164   int y0_fraction = 256 - y1_fraction;
3165   const uint8_t* src_ptr1 = src_ptr + src_stride;
3166   asm volatile(
3167       "cmp         %w4, #0                       \n"
3168       "b.eq        100f                          \n"
3169       "cmp         %w4, #128                     \n"
3170       "b.eq        50f                           \n"
3171 
3172       "dup         v5.16b, %w4                   \n"
3173       "dup         v4.16b, %w5                   \n"
3174       // General purpose row blend.
3175       "1:                                        \n"
3176       "ld1         {v0.16b}, [%1], #16           \n"
3177       "ld1         {v1.16b}, [%2], #16           \n"
3178       "subs        %w3, %w3, #16                 \n"
3179       "umull       v2.8h, v0.8b,  v4.8b          \n"
3180       "prfm        pldl1keep, [%1, 448]          \n"
3181       "umull2      v3.8h, v0.16b, v4.16b         \n"
3182       "prfm        pldl1keep, [%2, 448]          \n"
3183       "umlal       v2.8h, v1.8b,  v5.8b          \n"
3184       "umlal2      v3.8h, v1.16b, v5.16b         \n"
3185       "rshrn       v0.8b,  v2.8h, #8             \n"
3186       "rshrn2      v0.16b, v3.8h, #8             \n"
3187       "st1         {v0.16b}, [%0], #16           \n"
3188       "b.gt        1b                            \n"
3189       "b           99f                           \n"
3190 
3191       // Blend 50 / 50.
3192       "50:                                       \n"
3193       "ld1         {v0.16b}, [%1], #16           \n"
3194       "ld1         {v1.16b}, [%2], #16           \n"
3195       "subs        %w3, %w3, #16                 \n"
3196       "prfm        pldl1keep, [%1, 448]          \n"
3197       "urhadd      v0.16b, v0.16b, v1.16b        \n"
3198       "prfm        pldl1keep, [%2, 448]          \n"
3199       "st1         {v0.16b}, [%0], #16           \n"
3200       "b.gt        50b                           \n"
3201       "b           99f                           \n"
3202 
3203       // Blend 100 / 0 - Copy row unchanged.
3204       "100:                                      \n"
3205       "ld1         {v0.16b}, [%1], #16           \n"
3206       "subs        %w3, %w3, #16                 \n"
3207       "prfm        pldl1keep, [%1, 448]          \n"
3208       "st1         {v0.16b}, [%0], #16           \n"
3209       "b.gt        100b                          \n"
3210 
3211       "99:                                       \n"
3212       : "+r"(dst_ptr),      // %0
3213         "+r"(src_ptr),      // %1
3214         "+r"(src_ptr1),     // %2
3215         "+r"(dst_width),    // %3
3216         "+r"(y1_fraction),  // %4
3217         "+r"(y0_fraction)   // %5
3218       :
3219       : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
3220 }
3221 
3222 // Bilinear filter 8x2 -> 8x1
InterpolateRow_16_NEON(uint16_t * dst_ptr,const uint16_t * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)3223 void InterpolateRow_16_NEON(uint16_t* dst_ptr,
3224                             const uint16_t* src_ptr,
3225                             ptrdiff_t src_stride,
3226                             int dst_width,
3227                             int source_y_fraction) {
3228   int y1_fraction = source_y_fraction;
3229   int y0_fraction = 256 - y1_fraction;
3230   const uint16_t* src_ptr1 = src_ptr + src_stride;
3231 
3232   asm volatile(
3233       "cmp         %w4, #0                       \n"
3234       "b.eq        100f                          \n"
3235       "cmp         %w4, #128                     \n"
3236       "b.eq        50f                           \n"
3237 
3238       "dup         v5.8h, %w4                    \n"
3239       "dup         v4.8h, %w5                    \n"
3240       // General purpose row blend.
3241       "1:                                        \n"
3242       "ld1         {v0.8h}, [%1], #16            \n"
3243       "ld1         {v1.8h}, [%2], #16            \n"
3244       "subs        %w3, %w3, #8                  \n"
3245       "umull       v2.4s, v0.4h, v4.4h           \n"
3246       "prfm        pldl1keep, [%1, 448]          \n"
3247       "umull2      v3.4s, v0.8h, v4.8h           \n"
3248       "prfm        pldl1keep, [%2, 448]          \n"
3249       "umlal       v2.4s, v1.4h, v5.4h           \n"
3250       "umlal2      v3.4s, v1.8h, v5.8h           \n"
3251       "rshrn       v0.4h, v2.4s, #8              \n"
3252       "rshrn2      v0.8h, v3.4s, #8              \n"
3253       "st1         {v0.8h}, [%0], #16            \n"
3254       "b.gt        1b                            \n"
3255       "b           99f                           \n"
3256 
3257       // Blend 50 / 50.
3258       "50:                                       \n"
3259       "ld1         {v0.8h}, [%1], #16            \n"
3260       "ld1         {v1.8h}, [%2], #16            \n"
3261       "subs        %w3, %w3, #8                  \n"
3262       "prfm        pldl1keep, [%1, 448]          \n"
3263       "urhadd      v0.8h, v0.8h, v1.8h           \n"
3264       "prfm        pldl1keep, [%2, 448]          \n"
3265       "st1         {v0.8h}, [%0], #16            \n"
3266       "b.gt        50b                           \n"
3267       "b           99f                           \n"
3268 
3269       // Blend 100 / 0 - Copy row unchanged.
3270       "100:                                      \n"
3271       "ld1         {v0.8h}, [%1], #16            \n"
3272       "subs        %w3, %w3, #8                  \n"
3273       "prfm        pldl1keep, [%1, 448]          \n"
3274       "st1         {v0.8h}, [%0], #16            \n"
3275       "b.gt        100b                          \n"
3276 
3277       "99:                                       \n"
3278       : "+r"(dst_ptr),     // %0
3279         "+r"(src_ptr),     // %1
3280         "+r"(src_ptr1),    // %2
3281         "+r"(dst_width)    // %3
3282       : "r"(y1_fraction),  // %4
3283         "r"(y0_fraction)   // %5
3284       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
3285 }
3286 
3287 // Bilinear filter 8x2 -> 8x1
3288 // Use scale to convert lsb formats to msb, depending how many bits there are:
3289 // 32768 = 9 bits
3290 // 16384 = 10 bits
3291 // 4096 = 12 bits
3292 // 256 = 16 bits
InterpolateRow_16To8_NEON(uint8_t * dst_ptr,const uint16_t * src_ptr,ptrdiff_t src_stride,int scale,int dst_width,int source_y_fraction)3293 void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
3294                                const uint16_t* src_ptr,
3295                                ptrdiff_t src_stride,
3296                                int scale,
3297                                int dst_width,
3298                                int source_y_fraction) {
3299   int y1_fraction = source_y_fraction;
3300   int y0_fraction = 256 - y1_fraction;
3301   const uint16_t* src_ptr1 = src_ptr + src_stride;
3302   int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
3303 
3304   asm volatile(
3305       "dup         v6.8h, %w6                    \n"
3306       "cmp         %w4, #0                       \n"
3307       "b.eq        100f                          \n"
3308       "cmp         %w4, #128                     \n"
3309       "b.eq        50f                           \n"
3310 
3311       "dup         v5.8h, %w4                    \n"
3312       "dup         v4.8h, %w5                    \n"
3313       // General purpose row blend.
3314       "1:                                        \n"
3315       "ld1         {v0.8h}, [%1], #16            \n"
3316       "ld1         {v1.8h}, [%2], #16            \n"
3317       "subs        %w3, %w3, #8                  \n"
3318       "umull       v2.4s, v0.4h, v4.4h           \n"
3319       "prfm        pldl1keep, [%1, 448]          \n"
3320       "umull2      v3.4s, v0.8h, v4.8h           \n"
3321       "prfm        pldl1keep, [%2, 448]          \n"
3322       "umlal       v2.4s, v1.4h, v5.4h           \n"
3323       "umlal2      v3.4s, v1.8h, v5.8h           \n"
3324       "rshrn       v0.4h, v2.4s, #8              \n"
3325       "rshrn2      v0.8h, v3.4s, #8              \n"
3326       "ushl        v0.8h, v0.8h, v6.8h           \n"
3327       "uqxtn       v0.8b, v0.8h                  \n"
3328       "st1         {v0.8b}, [%0], #8             \n"
3329       "b.gt        1b                            \n"
3330       "b           99f                           \n"
3331 
3332       // Blend 50 / 50.
3333       "50:                                       \n"
3334       "ld1         {v0.8h}, [%1], #16            \n"
3335       "ld1         {v1.8h}, [%2], #16            \n"
3336       "subs        %w3, %w3, #8                  \n"
3337       "prfm        pldl1keep, [%1, 448]          \n"
3338       "urhadd      v0.8h, v0.8h, v1.8h           \n"
3339       "prfm        pldl1keep, [%2, 448]          \n"
3340       "ushl        v0.8h, v0.8h, v6.8h           \n"
3341       "uqxtn       v0.8b, v0.8h                  \n"
3342       "st1         {v0.8b}, [%0], #8             \n"
3343       "b.gt        50b                           \n"
3344       "b           99f                           \n"
3345 
3346       // Blend 100 / 0 - Copy row unchanged.
3347       "100:                                      \n"
3348       "ldr         q0, [%1], #16                 \n"
3349       "ushl        v0.8h, v0.8h, v2.8h           \n"  // shr = v2 is negative
3350       "prfm        pldl1keep, [%1, 448]          \n"
3351       "uqxtn       v0.8b, v0.8h                  \n"
3352       "subs        %w3, %w3, #8                  \n"  // 8 src pixels per loop
3353       "str         d0, [%0], #8                  \n"  // store 8 pixels
3354       "b.gt        100b                          \n"
3355 
3356       "99:                                       \n"
3357       : "+r"(dst_ptr),     // %0
3358         "+r"(src_ptr),     // %1
3359         "+r"(src_ptr1),    // %2
3360         "+r"(dst_width)    // %3
3361       : "r"(y1_fraction),  // %4
3362         "r"(y0_fraction),  // %5
3363         "r"(shift)         // %6
3364       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
3365 }
3366 
3367 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
ARGBBlendRow_NEON(const uint8_t * src_argb,const uint8_t * src_argb1,uint8_t * dst_argb,int width)3368 void ARGBBlendRow_NEON(const uint8_t* src_argb,
3369                        const uint8_t* src_argb1,
3370                        uint8_t* dst_argb,
3371                        int width) {
3372   asm volatile(
3373       "subs        %w3, %w3, #8                  \n"
3374       "b.lt        89f                           \n"
3375       // Blend 8 pixels.
3376       "8:                                        \n"
3377       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
3378       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
3379       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
3380       "umull       v16.8h, v4.8b, v3.8b          \n"  // db * a
3381       "prfm        pldl1keep, [%0, 448]          \n"
3382       "umull       v17.8h, v5.8b, v3.8b          \n"  // dg * a
3383       "prfm        pldl1keep, [%1, 448]          \n"
3384       "umull       v18.8h, v6.8b, v3.8b          \n"  // dr * a
3385       "uqrshrn     v16.8b, v16.8h, #8            \n"  // db >>= 8
3386       "uqrshrn     v17.8b, v17.8h, #8            \n"  // dg >>= 8
3387       "uqrshrn     v18.8b, v18.8h, #8            \n"  // dr >>= 8
3388       "uqsub       v4.8b, v4.8b, v16.8b          \n"  // db - (db * a / 256)
3389       "uqsub       v5.8b, v5.8b, v17.8b          \n"  // dg - (dg * a / 256)
3390       "uqsub       v6.8b, v6.8b, v18.8b          \n"  // dr - (dr * a / 256)
3391       "uqadd       v0.8b, v0.8b, v4.8b           \n"  // + sb
3392       "uqadd       v1.8b, v1.8b, v5.8b           \n"  // + sg
3393       "uqadd       v2.8b, v2.8b, v6.8b           \n"  // + sr
3394       "movi        v3.8b, #255                   \n"  // a = 255
3395       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
3396                                                              // pixels
3397       "b.ge        8b                            \n"
3398 
3399       "89:                                       \n"
3400       "adds        %w3, %w3, #8-1                \n"
3401       "b.lt        99f                           \n"
3402 
3403       // Blend 1 pixels.
3404       "1:                                        \n"
3405       "ld4         {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel
3406                                                            // ARGB0.
3407       "ld4         {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel
3408                                                            // ARGB1.
3409       "subs        %w3, %w3, #1                  \n"  // 1 processed per loop.
3410       "umull       v16.8h, v4.8b, v3.8b          \n"  // db * a
3411       "prfm        pldl1keep, [%0, 448]          \n"
3412       "umull       v17.8h, v5.8b, v3.8b          \n"  // dg * a
3413       "prfm        pldl1keep, [%1, 448]          \n"
3414       "umull       v18.8h, v6.8b, v3.8b          \n"  // dr * a
3415       "uqrshrn     v16.8b, v16.8h, #8            \n"  // db >>= 8
3416       "uqrshrn     v17.8b, v17.8h, #8            \n"  // dg >>= 8
3417       "uqrshrn     v18.8b, v18.8h, #8            \n"  // dr >>= 8
3418       "uqsub       v4.8b, v4.8b, v16.8b          \n"  // db - (db * a / 256)
3419       "uqsub       v5.8b, v5.8b, v17.8b          \n"  // dg - (dg * a / 256)
3420       "uqsub       v6.8b, v6.8b, v18.8b          \n"  // dr - (dr * a / 256)
3421       "uqadd       v0.8b, v0.8b, v4.8b           \n"  // + sb
3422       "uqadd       v1.8b, v1.8b, v5.8b           \n"  // + sg
3423       "uqadd       v2.8b, v2.8b, v6.8b           \n"  // + sr
3424       "movi        v3.8b, #255                   \n"  // a = 255
3425       "st4         {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
3426       "b.ge        1b                            \n"
3427 
3428       "99:                                       \n"
3429 
3430       : "+r"(src_argb),   // %0
3431         "+r"(src_argb1),  // %1
3432         "+r"(dst_argb),   // %2
3433         "+r"(width)       // %3
3434       :
3435       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3436         "v17", "v18");
3437 }
3438 
3439 // Attenuate 8 pixels at a time.
ARGBAttenuateRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,int width)3440 void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
3441                            uint8_t* dst_argb,
3442                            int width) {
3443   asm volatile(
3444       "movi        v7.8h, #0x00ff                \n"  // 255 for rounding up
3445 
3446       // Attenuate 8 pixels.
3447       "1:                                        \n"
3448       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
3449       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
3450       "umull       v4.8h, v0.8b, v3.8b           \n"  // b * a
3451       "prfm        pldl1keep, [%0, 448]          \n"
3452       "umull       v5.8h, v1.8b, v3.8b           \n"         // g * a
3453       "umull       v6.8h, v2.8b, v3.8b           \n"         // r * a
3454       "addhn       v0.8b, v4.8h, v7.8h           \n"         // (b + 255) >> 8
3455       "addhn       v1.8b, v5.8h, v7.8h           \n"         // (g + 255) >> 8
3456       "addhn       v2.8b, v6.8h, v7.8h           \n"         // (r + 255) >> 8
3457       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
3458       "b.gt        1b                            \n"
3459       : "+r"(src_argb),  // %0
3460         "+r"(dst_argb),  // %1
3461         "+r"(width)      // %2
3462       :
3463       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3464 }
3465 
3466 // Quantize 8 ARGB pixels (32 bytes).
3467 // dst = (dst * scale >> 16) * interval_size + interval_offset;
ARGBQuantizeRow_NEON(uint8_t * dst_argb,int scale,int interval_size,int interval_offset,int width)3468 void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
3469                           int scale,
3470                           int interval_size,
3471                           int interval_offset,
3472                           int width) {
3473   asm volatile(
3474       "dup         v4.8h, %w2                    \n"
3475       "ushr        v4.8h, v4.8h, #1              \n"  // scale >>= 1
3476       "dup         v5.8h, %w3                    \n"  // interval multiply.
3477       "dup         v6.8h, %w4                    \n"  // interval add
3478 
3479       // 8 pixel loop.
3480       "1:                                        \n"
3481       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8  ARGB.
3482       "subs        %w1, %w1, #8                  \n"    // 8 processed per loop.
3483       "uxtl        v0.8h, v0.8b                  \n"    // b (0 .. 255)
3484       "prfm        pldl1keep, [%0, 448]          \n"
3485       "uxtl        v1.8h, v1.8b                  \n"
3486       "uxtl        v2.8h, v2.8b                  \n"
3487       "sqdmulh     v0.8h, v0.8h, v4.8h           \n"  // b * scale
3488       "sqdmulh     v1.8h, v1.8h, v4.8h           \n"  // g
3489       "sqdmulh     v2.8h, v2.8h, v4.8h           \n"  // r
3490       "mul         v0.8h, v0.8h, v5.8h           \n"  // b * interval_size
3491       "mul         v1.8h, v1.8h, v5.8h           \n"  // g
3492       "mul         v2.8h, v2.8h, v5.8h           \n"  // r
3493       "add         v0.8h, v0.8h, v6.8h           \n"  // b + interval_offset
3494       "add         v1.8h, v1.8h, v6.8h           \n"  // g
3495       "add         v2.8h, v2.8h, v6.8h           \n"  // r
3496       "uqxtn       v0.8b, v0.8h                  \n"
3497       "uqxtn       v1.8b, v1.8h                  \n"
3498       "uqxtn       v2.8b, v2.8h                  \n"
3499       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
3500       "b.gt        1b                            \n"
3501       : "+r"(dst_argb),       // %0
3502         "+r"(width)           // %1
3503       : "r"(scale),           // %2
3504         "r"(interval_size),   // %3
3505         "r"(interval_offset)  // %4
3506       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
3507 }
3508 
3509 // Shade 8 pixels at a time by specified value.
3510 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
3511 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
ARGBShadeRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,int width,uint32_t value)3512 void ARGBShadeRow_NEON(const uint8_t* src_argb,
3513                        uint8_t* dst_argb,
3514                        int width,
3515                        uint32_t value) {
3516   asm volatile(
3517       "dup         v0.4s, %w3                    \n"  // duplicate scale value.
3518       "zip1        v0.8b, v0.8b, v0.8b           \n"  // v0.8b aarrggbb.
3519       "ushr        v0.8h, v0.8h, #1              \n"  // scale / 2.
3520 
3521       // 8 pixel loop.
3522       "1:                                        \n"
3523       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
3524       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
3525       "uxtl        v4.8h, v4.8b                  \n"  // b (0 .. 255)
3526       "prfm        pldl1keep, [%0, 448]          \n"
3527       "uxtl        v5.8h, v5.8b                  \n"
3528       "uxtl        v6.8h, v6.8b                  \n"
3529       "uxtl        v7.8h, v7.8b                  \n"
3530       "sqrdmulh    v4.8h, v4.8h, v0.h[0]         \n"  // b * scale * 2
3531       "sqrdmulh    v5.8h, v5.8h, v0.h[1]         \n"  // g
3532       "sqrdmulh    v6.8h, v6.8h, v0.h[2]         \n"  // r
3533       "sqrdmulh    v7.8h, v7.8h, v0.h[3]         \n"  // a
3534       "uqxtn       v4.8b, v4.8h                  \n"
3535       "uqxtn       v5.8b, v5.8h                  \n"
3536       "uqxtn       v6.8b, v6.8h                  \n"
3537       "uqxtn       v7.8b, v7.8h                  \n"
3538       "st4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
3539       "b.gt        1b                            \n"
3540       : "+r"(src_argb),  // %0
3541         "+r"(dst_argb),  // %1
3542         "+r"(width)      // %2
3543       : "r"(value)       // %3
3544       : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
3545 }
3546 
3547 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
3548 // Similar to ARGBToYJ but stores ARGB.
3549 // C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
ARGBGrayRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,int width)3550 void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
3551   asm volatile(
3552       "movi        v24.8b, #29                   \n"  // B * 0.1140 coefficient
3553       "movi        v25.8b, #150                  \n"  // G * 0.5870 coefficient
3554       "movi        v26.8b, #77                   \n"  // R * 0.2990 coefficient
3555       "1:                                        \n"
3556       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
3557       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
3558       "umull       v4.8h, v0.8b, v24.8b          \n"  // B
3559       "prfm        pldl1keep, [%0, 448]          \n"
3560       "umlal       v4.8h, v1.8b, v25.8b          \n"  // G
3561       "umlal       v4.8h, v2.8b, v26.8b          \n"  // R
3562       "uqrshrn     v0.8b, v4.8h, #8              \n"  // 16 bit to 8 bit B
3563       "orr         v1.8b, v0.8b, v0.8b           \n"  // G
3564       "orr         v2.8b, v0.8b, v0.8b           \n"  // R
3565       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
3566       "b.gt        1b                            \n"
3567       : "+r"(src_argb),  // %0
3568         "+r"(dst_argb),  // %1
3569         "+r"(width)      // %2
3570       :
3571       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
3572 }
3573 
3574 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3575 //    b = (r * 35 + g * 68 + b * 17) >> 7
3576 //    g = (r * 45 + g * 88 + b * 22) >> 7
3577 //    r = (r * 50 + g * 98 + b * 24) >> 7
3578 
ARGBSepiaRow_NEON(uint8_t * dst_argb,int width)3579 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
3580   asm volatile(
3581       "movi        v20.8b, #17                   \n"  // BB coefficient
3582       "movi        v21.8b, #68                   \n"  // BG coefficient
3583       "movi        v22.8b, #35                   \n"  // BR coefficient
3584       "movi        v24.8b, #22                   \n"  // GB coefficient
3585       "movi        v25.8b, #88                   \n"  // GG coefficient
3586       "movi        v26.8b, #45                   \n"  // GR coefficient
3587       "movi        v28.8b, #24                   \n"  // BB coefficient
3588       "movi        v29.8b, #98                   \n"  // BG coefficient
3589       "movi        v30.8b, #50                   \n"  // BR coefficient
3590       "1:                                        \n"
3591       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
3592       "subs        %w1, %w1, #8                  \n"    // 8 processed per loop.
3593       "umull       v4.8h, v0.8b, v20.8b          \n"    // B to Sepia B
3594       "prfm        pldl1keep, [%0, 448]          \n"
3595       "umlal       v4.8h, v1.8b, v21.8b          \n"  // G
3596       "umlal       v4.8h, v2.8b, v22.8b          \n"  // R
3597       "umull       v5.8h, v0.8b, v24.8b          \n"  // B to Sepia G
3598       "umlal       v5.8h, v1.8b, v25.8b          \n"  // G
3599       "umlal       v5.8h, v2.8b, v26.8b          \n"  // R
3600       "umull       v6.8h, v0.8b, v28.8b          \n"  // B to Sepia R
3601       "umlal       v6.8h, v1.8b, v29.8b          \n"  // G
3602       "umlal       v6.8h, v2.8b, v30.8b          \n"  // R
3603       "uqshrn      v0.8b, v4.8h, #7              \n"  // 16 bit to 8 bit B
3604       "uqshrn      v1.8b, v5.8h, #7              \n"  // 16 bit to 8 bit G
3605       "uqshrn      v2.8b, v6.8h, #7              \n"  // 16 bit to 8 bit R
3606       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
3607       "b.gt        1b                            \n"
3608       : "+r"(dst_argb),  // %0
3609         "+r"(width)      // %1
3610       :
3611       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
3612         "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
3613 }
3614 
3615 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
3616 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
3617 // needs to saturate.  Consider doing a non-saturating version.
ARGBColorMatrixRow_NEON(const uint8_t * src_argb,uint8_t * dst_argb,const int8_t * matrix_argb,int width)3618 void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
3619                              uint8_t* dst_argb,
3620                              const int8_t* matrix_argb,
3621                              int width) {
3622   asm volatile(
3623       "ld1         {v2.16b}, [%3]                \n"  // load 3 ARGB vectors.
3624       "sxtl        v0.8h, v2.8b                  \n"  // B,G coefficients s16.
3625       "sxtl2       v1.8h, v2.16b                 \n"  // R,A coefficients s16.
3626 
3627       "1:                                        \n"
3628       "ld4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
3629       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop.
3630       "uxtl        v16.8h, v16.8b                \n"  // b (0 .. 255) 16 bit
3631       "prfm        pldl1keep, [%0, 448]          \n"
3632       "uxtl        v17.8h, v17.8b                \n"  // g
3633       "uxtl        v18.8h, v18.8b                \n"  // r
3634       "uxtl        v19.8h, v19.8b                \n"  // a
3635       "mul         v22.8h, v16.8h, v0.h[0]       \n"  // B = B * Matrix B
3636       "mul         v23.8h, v16.8h, v0.h[4]       \n"  // G = B * Matrix G
3637       "mul         v24.8h, v16.8h, v1.h[0]       \n"  // R = B * Matrix R
3638       "mul         v25.8h, v16.8h, v1.h[4]       \n"  // A = B * Matrix A
3639       "mul         v4.8h, v17.8h, v0.h[1]        \n"  // B += G * Matrix B
3640       "mul         v5.8h, v17.8h, v0.h[5]        \n"  // G += G * Matrix G
3641       "mul         v6.8h, v17.8h, v1.h[1]        \n"  // R += G * Matrix R
3642       "mul         v7.8h, v17.8h, v1.h[5]        \n"  // A += G * Matrix A
3643       "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
3644       "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
3645       "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
3646       "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
3647       "mul         v4.8h, v18.8h, v0.h[2]        \n"  // B += R * Matrix B
3648       "mul         v5.8h, v18.8h, v0.h[6]        \n"  // G += R * Matrix G
3649       "mul         v6.8h, v18.8h, v1.h[2]        \n"  // R += R * Matrix R
3650       "mul         v7.8h, v18.8h, v1.h[6]        \n"  // A += R * Matrix A
3651       "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
3652       "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
3653       "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
3654       "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
3655       "mul         v4.8h, v19.8h, v0.h[3]        \n"  // B += A * Matrix B
3656       "mul         v5.8h, v19.8h, v0.h[7]        \n"  // G += A * Matrix G
3657       "mul         v6.8h, v19.8h, v1.h[3]        \n"  // R += A * Matrix R
3658       "mul         v7.8h, v19.8h, v1.h[7]        \n"  // A += A * Matrix A
3659       "sqadd       v22.8h, v22.8h, v4.8h         \n"  // Accumulate B
3660       "sqadd       v23.8h, v23.8h, v5.8h         \n"  // Accumulate G
3661       "sqadd       v24.8h, v24.8h, v6.8h         \n"  // Accumulate R
3662       "sqadd       v25.8h, v25.8h, v7.8h         \n"  // Accumulate A
3663       "sqshrun     v16.8b, v22.8h, #6            \n"  // 16 bit to 8 bit B
3664       "sqshrun     v17.8b, v23.8h, #6            \n"  // 16 bit to 8 bit G
3665       "sqshrun     v18.8b, v24.8h, #6            \n"  // 16 bit to 8 bit R
3666       "sqshrun     v19.8b, v25.8h, #6            \n"  // 16 bit to 8 bit A
3667       "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
3668       "b.gt        1b                            \n"
3669       : "+r"(src_argb),   // %0
3670         "+r"(dst_argb),   // %1
3671         "+r"(width)       // %2
3672       : "r"(matrix_argb)  // %3
3673       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3674         "v17", "v18", "v19", "v22", "v23", "v24", "v25");
3675 }
3676 
3677 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
3678 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_NEON(const uint8_t * src_argb,const uint8_t * src_argb1,uint8_t * dst_argb,int width)3679 void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
3680                           const uint8_t* src_argb1,
3681                           uint8_t* dst_argb,
3682                           int width) {
3683   asm volatile(
3684       // 8 pixel loop.
3685       "1:                                        \n"
3686       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
3687       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
3688       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
3689       "umull       v0.8h, v0.8b, v4.8b           \n"  // multiply B
3690       "prfm        pldl1keep, [%0, 448]          \n"
3691       "umull       v1.8h, v1.8b, v5.8b           \n"  // multiply G
3692       "prfm        pldl1keep, [%1, 448]          \n"
3693       "umull       v2.8h, v2.8b, v6.8b           \n"  // multiply R
3694       "umull       v3.8h, v3.8b, v7.8b           \n"  // multiply A
3695       "rshrn       v0.8b, v0.8h, #8              \n"  // 16 bit to 8 bit B
3696       "rshrn       v1.8b, v1.8h, #8              \n"  // 16 bit to 8 bit G
3697       "rshrn       v2.8b, v2.8h, #8              \n"  // 16 bit to 8 bit R
3698       "rshrn       v3.8b, v3.8h, #8              \n"  // 16 bit to 8 bit A
3699       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
3700       "b.gt        1b                            \n"
3701       : "+r"(src_argb),   // %0
3702         "+r"(src_argb1),  // %1
3703         "+r"(dst_argb),   // %2
3704         "+r"(width)       // %3
3705       :
3706       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3707 }
3708 
3709 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_NEON(const uint8_t * src_argb,const uint8_t * src_argb1,uint8_t * dst_argb,int width)3710 void ARGBAddRow_NEON(const uint8_t* src_argb,
3711                      const uint8_t* src_argb1,
3712                      uint8_t* dst_argb,
3713                      int width) {
3714   asm volatile(
3715       // 8 pixel loop.
3716       "1:                                        \n"
3717       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
3718       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
3719       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
3720       "uqadd       v0.8b, v0.8b, v4.8b           \n"
3721       "prfm        pldl1keep, [%0, 448]          \n"
3722       "uqadd       v1.8b, v1.8b, v5.8b           \n"
3723       "prfm        pldl1keep, [%1, 448]          \n"
3724       "uqadd       v2.8b, v2.8b, v6.8b           \n"
3725       "uqadd       v3.8b, v3.8b, v7.8b           \n"
3726       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
3727       "b.gt        1b                            \n"
3728       : "+r"(src_argb),   // %0
3729         "+r"(src_argb1),  // %1
3730         "+r"(dst_argb),   // %2
3731         "+r"(width)       // %3
3732       :
3733       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3734 }
3735 
3736 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_NEON(const uint8_t * src_argb,const uint8_t * src_argb1,uint8_t * dst_argb,int width)3737 void ARGBSubtractRow_NEON(const uint8_t* src_argb,
3738                           const uint8_t* src_argb1,
3739                           uint8_t* dst_argb,
3740                           int width) {
3741   asm volatile(
3742       // 8 pixel loop.
3743       "1:                                        \n"
3744       "ld4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
3745       "ld4         {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
3746       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
3747       "uqsub       v0.8b, v0.8b, v4.8b           \n"
3748       "prfm        pldl1keep, [%0, 448]          \n"
3749       "uqsub       v1.8b, v1.8b, v5.8b           \n"
3750       "prfm        pldl1keep, [%1, 448]          \n"
3751       "uqsub       v2.8b, v2.8b, v6.8b           \n"
3752       "uqsub       v3.8b, v3.8b, v7.8b           \n"
3753       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
3754       "b.gt        1b                            \n"
3755       : "+r"(src_argb),   // %0
3756         "+r"(src_argb1),  // %1
3757         "+r"(dst_argb),   // %2
3758         "+r"(width)       // %3
3759       :
3760       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3761 }
3762 
3763 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
3764 // A = 255
3765 // R = Sobel
3766 // G = Sobel
3767 // B = Sobel
SobelRow_NEON(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_argb,int width)3768 void SobelRow_NEON(const uint8_t* src_sobelx,
3769                    const uint8_t* src_sobely,
3770                    uint8_t* dst_argb,
3771                    int width) {
3772   asm volatile(
3773       "movi        v3.8b, #255                   \n"  // alpha
3774       // 8 pixel loop.
3775       "1:                                        \n"
3776       "ld1         {v0.8b}, [%0], #8             \n"  // load 8 sobelx.
3777       "ld1         {v1.8b}, [%1], #8             \n"  // load 8 sobely.
3778       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
3779       "uqadd       v0.8b, v0.8b, v1.8b           \n"  // add
3780       "prfm        pldl1keep, [%0, 448]          \n"
3781       "orr         v1.8b, v0.8b, v0.8b           \n"
3782       "prfm        pldl1keep, [%1, 448]          \n"
3783       "orr         v2.8b, v0.8b, v0.8b           \n"
3784       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
3785       "b.gt        1b                            \n"
3786       : "+r"(src_sobelx),  // %0
3787         "+r"(src_sobely),  // %1
3788         "+r"(dst_argb),    // %2
3789         "+r"(width)        // %3
3790       :
3791       : "cc", "memory", "v0", "v1", "v2", "v3");
3792 }
3793 
3794 // Adds Sobel X and Sobel Y and stores Sobel into plane.
SobelToPlaneRow_NEON(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_y,int width)3795 void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
3796                           const uint8_t* src_sobely,
3797                           uint8_t* dst_y,
3798                           int width) {
3799   asm volatile(
3800       // 16 pixel loop.
3801       "1:                                        \n"
3802       "ld1         {v0.16b}, [%0], #16           \n"  // load 16 sobelx.
3803       "ld1         {v1.16b}, [%1], #16           \n"  // load 16 sobely.
3804       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
3805       "prfm        pldl1keep, [%0, 448]          \n"
3806       "uqadd       v0.16b, v0.16b, v1.16b        \n"  // add
3807       "prfm        pldl1keep, [%1, 448]          \n"
3808       "st1         {v0.16b}, [%2], #16           \n"  // store 16 pixels.
3809       "b.gt        1b                            \n"
3810       : "+r"(src_sobelx),  // %0
3811         "+r"(src_sobely),  // %1
3812         "+r"(dst_y),       // %2
3813         "+r"(width)        // %3
3814       :
3815       : "cc", "memory", "v0", "v1");
3816 }
3817 
3818 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
3819 // A = 255
3820 // R = Sobel X
3821 // G = Sobel
3822 // B = Sobel Y
SobelXYRow_NEON(const uint8_t * src_sobelx,const uint8_t * src_sobely,uint8_t * dst_argb,int width)3823 void SobelXYRow_NEON(const uint8_t* src_sobelx,
3824                      const uint8_t* src_sobely,
3825                      uint8_t* dst_argb,
3826                      int width) {
3827   asm volatile(
3828       "movi        v3.8b, #255                   \n"  // alpha
3829       // 8 pixel loop.
3830       "1:                                        \n"
3831       "ld1         {v2.8b}, [%0], #8             \n"  // load 8 sobelx.
3832       "ld1         {v0.8b}, [%1], #8             \n"  // load 8 sobely.
3833       "subs        %w3, %w3, #8                  \n"  // 8 processed per loop.
3834       "prfm        pldl1keep, [%0, 448]          \n"
3835       "uqadd       v1.8b, v0.8b, v2.8b           \n"  // add
3836       "prfm        pldl1keep, [%1, 448]          \n"
3837       "st4         {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
3838       "b.gt        1b                            \n"
3839       : "+r"(src_sobelx),  // %0
3840         "+r"(src_sobely),  // %1
3841         "+r"(dst_argb),    // %2
3842         "+r"(width)        // %3
3843       :
3844       : "cc", "memory", "v0", "v1", "v2", "v3");
3845 }
3846 
3847 // SobelX as a matrix is
3848 // -1  0  1
3849 // -2  0  2
3850 // -1  0  1
SobelXRow_NEON(const uint8_t * src_y0,const uint8_t * src_y1,const uint8_t * src_y2,uint8_t * dst_sobelx,int width)3851 void SobelXRow_NEON(const uint8_t* src_y0,
3852                     const uint8_t* src_y1,
3853                     const uint8_t* src_y2,
3854                     uint8_t* dst_sobelx,
3855                     int width) {
3856   asm volatile(
3857       "1:                                        \n"
3858       "ld1         {v0.8b}, [%0],%5              \n"  // top
3859       "ld1         {v1.8b}, [%0],%6              \n"
3860       "usubl       v0.8h, v0.8b, v1.8b           \n"
3861       "prfm        pldl1keep, [%0, 448]          \n"
3862       "ld1         {v2.8b}, [%1],%5              \n"  // center * 2
3863       "ld1         {v3.8b}, [%1],%6              \n"
3864       "usubl       v1.8h, v2.8b, v3.8b           \n"
3865       "prfm        pldl1keep, [%1, 448]          \n"
3866       "add         v0.8h, v0.8h, v1.8h           \n"
3867       "add         v0.8h, v0.8h, v1.8h           \n"
3868       "ld1         {v2.8b}, [%2],%5              \n"  // bottom
3869       "ld1         {v3.8b}, [%2],%6              \n"
3870       "subs        %w4, %w4, #8                  \n"  // 8 pixels
3871       "prfm        pldl1keep, [%2, 448]          \n"
3872       "usubl       v1.8h, v2.8b, v3.8b           \n"
3873       "add         v0.8h, v0.8h, v1.8h           \n"
3874       "abs         v0.8h, v0.8h                  \n"
3875       "uqxtn       v0.8b, v0.8h                  \n"
3876       "st1         {v0.8b}, [%3], #8             \n"  // store 8 sobelx
3877       "b.gt        1b                            \n"
3878       : "+r"(src_y0),                           // %0
3879         "+r"(src_y1),                           // %1
3880         "+r"(src_y2),                           // %2
3881         "+r"(dst_sobelx),                       // %3
3882         "+r"(width)                             // %4
3883       : "r"(2LL),                               // %5
3884         "r"(6LL)                                // %6
3885       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3886   );
3887 }
3888 
3889 // SobelY as a matrix is
3890 // -1 -2 -1
3891 //  0  0  0
3892 //  1  2  1
SobelYRow_NEON(const uint8_t * src_y0,const uint8_t * src_y1,uint8_t * dst_sobely,int width)3893 void SobelYRow_NEON(const uint8_t* src_y0,
3894                     const uint8_t* src_y1,
3895                     uint8_t* dst_sobely,
3896                     int width) {
3897   asm volatile(
3898       "1:                                        \n"
3899       "ld1         {v0.8b}, [%0],%4              \n"  // left
3900       "ld1         {v1.8b}, [%1],%4              \n"
3901       "usubl       v0.8h, v0.8b, v1.8b           \n"
3902       "ld1         {v2.8b}, [%0],%4              \n"  // center * 2
3903       "ld1         {v3.8b}, [%1],%4              \n"
3904       "usubl       v1.8h, v2.8b, v3.8b           \n"
3905       "add         v0.8h, v0.8h, v1.8h           \n"
3906       "add         v0.8h, v0.8h, v1.8h           \n"
3907       "ld1         {v2.8b}, [%0],%5              \n"  // right
3908       "ld1         {v3.8b}, [%1],%5              \n"
3909       "subs        %w3, %w3, #8                  \n"  // 8 pixels
3910       "usubl       v1.8h, v2.8b, v3.8b           \n"
3911       "prfm        pldl1keep, [%0, 448]          \n"
3912       "add         v0.8h, v0.8h, v1.8h           \n"
3913       "prfm        pldl1keep, [%1, 448]          \n"
3914       "abs         v0.8h, v0.8h                  \n"
3915       "uqxtn       v0.8b, v0.8h                  \n"
3916       "st1         {v0.8b}, [%2], #8             \n"  // store 8 sobely
3917       "b.gt        1b                            \n"
3918       : "+r"(src_y0),                           // %0
3919         "+r"(src_y1),                           // %1
3920         "+r"(dst_sobely),                       // %2
3921         "+r"(width)                             // %3
3922       : "r"(1LL),                               // %4
3923         "r"(6LL)                                // %5
3924       : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
3925   );
3926 }
3927 
3928 // Caveat - rounds float to half float whereas scaling version truncates.
HalfFloat1Row_NEON(const uint16_t * src,uint16_t * dst,float,int width)3929 void HalfFloat1Row_NEON(const uint16_t* src,
3930                         uint16_t* dst,
3931                         float /*unused*/,
3932                         int width) {
3933   asm volatile(
3934       "1:                                        \n"
3935       "ld1         {v1.16b}, [%0], #16           \n"  // load 8 shorts
3936       "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
3937       "uxtl        v2.4s, v1.4h                  \n"  // 8 int's
3938       "prfm        pldl1keep, [%0, 448]          \n"
3939       "uxtl2       v3.4s, v1.8h                  \n"
3940       "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
3941       "scvtf       v3.4s, v3.4s                  \n"
3942       "fcvtn       v1.4h, v2.4s                  \n"  // 8 half floats
3943       "fcvtn2      v1.8h, v3.4s                  \n"
3944       "st1         {v1.16b}, [%1], #16           \n"  // store 8 shorts
3945       "b.gt        1b                            \n"
3946       : "+r"(src),   // %0
3947         "+r"(dst),   // %1
3948         "+r"(width)  // %2
3949       :
3950       : "cc", "memory", "v1", "v2", "v3");
3951 }
3952 
HalfFloatRow_NEON(const uint16_t * src,uint16_t * dst,float scale,int width)3953 void HalfFloatRow_NEON(const uint16_t* src,
3954                        uint16_t* dst,
3955                        float scale,
3956                        int width) {
3957   asm volatile(
3958       "1:                                        \n"
3959       "ld1         {v1.16b}, [%0], #16           \n"  // load 8 shorts
3960       "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
3961       "uxtl        v2.4s, v1.4h                  \n"  // 8 int's
3962       "prfm        pldl1keep, [%0, 448]          \n"
3963       "uxtl2       v3.4s, v1.8h                  \n"
3964       "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
3965       "scvtf       v3.4s, v3.4s                  \n"
3966       "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // adjust exponent
3967       "fmul        v3.4s, v3.4s, %3.s[0]         \n"
3968       "uqshrn      v1.4h, v2.4s, #13             \n"  // isolate halffloat
3969       "uqshrn2     v1.8h, v3.4s, #13             \n"
3970       "st1         {v1.16b}, [%1], #16           \n"  // store 8 shorts
3971       "b.gt        1b                            \n"
3972       : "+r"(src),                      // %0
3973         "+r"(dst),                      // %1
3974         "+r"(width)                     // %2
3975       : "w"(scale * 1.9259299444e-34f)  // %3
3976       : "cc", "memory", "v1", "v2", "v3");
3977 }
3978 
ByteToFloatRow_NEON(const uint8_t * src,float * dst,float scale,int width)3979 void ByteToFloatRow_NEON(const uint8_t* src,
3980                          float* dst,
3981                          float scale,
3982                          int width) {
3983   asm volatile(
3984       "1:                                        \n"
3985       "ld1         {v1.8b}, [%0], #8             \n"  // load 8 bytes
3986       "subs        %w2, %w2, #8                  \n"  // 8 pixels per loop
3987       "uxtl        v1.8h, v1.8b                  \n"  // 8 shorts
3988       "prfm        pldl1keep, [%0, 448]          \n"
3989       "uxtl        v2.4s, v1.4h                  \n"  // 8 ints
3990       "uxtl2       v3.4s, v1.8h                  \n"
3991       "scvtf       v2.4s, v2.4s                  \n"  // 8 floats
3992       "scvtf       v3.4s, v3.4s                  \n"
3993       "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // scale
3994       "fmul        v3.4s, v3.4s, %3.s[0]         \n"
3995       "st1         {v2.16b, v3.16b}, [%1], #32   \n"  // store 8 floats
3996       "b.gt        1b                            \n"
3997       : "+r"(src),   // %0
3998         "+r"(dst),   // %1
3999         "+r"(width)  // %2
4000       : "w"(scale)   // %3
4001       : "cc", "memory", "v1", "v2", "v3");
4002 }
4003 
4004 // Convert FP16 Half Floats to FP32 Floats
ConvertFP16ToFP32Row_NEON(const uint16_t * src,float * dst,int width)4005 void ConvertFP16ToFP32Row_NEON(const uint16_t* src,  // fp16
4006                                float* dst,
4007                                int width) {
4008   asm volatile(
4009       "1:                                        \n"
4010       "ld1         {v1.8h}, [%0], #16            \n"  // load 8 halffloats
4011       "subs        %w2, %w2, #8                  \n"  // 8 floats per loop
4012       "prfm        pldl1keep, [%0, 448]          \n"
4013       "fcvtl       v2.4s, v1.4h                  \n"  // 8 floats
4014       "fcvtl2      v3.4s, v1.8h                  \n"
4015       "stp         q2, q3, [%1], #32             \n"  // store 8 floats
4016       "b.gt        1b                            \n"
4017       : "+r"(src),   // %0
4018         "+r"(dst),   // %1
4019         "+r"(width)  // %2
4020       :
4021       : "cc", "memory", "v1", "v2", "v3");
4022 }
4023 
4024 // Convert FP16 Half Floats to FP32 Floats
4025 // Read a column and write a row
ConvertFP16ToFP32Column_NEON(const uint16_t * src,int src_stride,float * dst,int width)4026 void ConvertFP16ToFP32Column_NEON(const uint16_t* src,  // fp16
4027                                   int src_stride,       // stride in elements
4028                                   float* dst,
4029                                   int width) {
4030   asm volatile(
4031       "cmp         %w2, #8                       \n"  // Is there 8 rows?
4032       "b.lo        2f                            \n"
4033       "1:                                        \n"
4034       "ld1         {v0.h}[0], [%0], %3           \n"  // load 8 halffloats
4035       "ld1         {v0.h}[1], [%0], %3           \n"
4036       "ld1         {v0.h}[2], [%0], %3           \n"
4037       "ld1         {v0.h}[3], [%0], %3           \n"
4038       "ld1         {v1.h}[0], [%0], %3           \n"
4039       "ld1         {v1.h}[1], [%0], %3           \n"
4040       "ld1         {v1.h}[2], [%0], %3           \n"
4041       "ld1         {v1.h}[3], [%0], %3           \n"
4042       "subs        %w2, %w2, #8                  \n"  // 8 rows per loop
4043       "prfm        pldl1keep, [%0, 448]          \n"
4044       "fcvtl       v2.4s, v0.4h                  \n"  // 4 floats
4045       "fcvtl       v3.4s, v1.4h                  \n"  // 4 more floats
4046       "stp         q2, q3, [%1], #32             \n"  // store 8 floats
4047       "b.gt        1b                            \n"
4048       "cmp         %w2, #1                       \n"  // Is there 1 value?
4049       "b.lo        3f                            \n"
4050       "2:                                        \n"
4051       "ld1         {v1.h}[0], [%0], %3           \n"  // load 1 halffloats
4052       "subs        %w2, %w2, #1                  \n"  // 1 floats per loop
4053       "fcvtl       v2.4s, v1.4h                  \n"  // 1 floats
4054       "str         s2, [%1], #4                  \n"  // store 1 floats
4055       "b.gt        2b                            \n"
4056       "3:                                        \n"
4057       : "+r"(src),                        // %0
4058         "+r"(dst),                        // %1
4059         "+r"(width)                       // %2
4060       : "r"((ptrdiff_t)(src_stride * 2))  // %3
4061       : "cc", "memory", "v0", "v1", "v2", "v3");
4062 }
4063 
4064 // Convert FP32 Floats to FP16 Half Floats
ConvertFP32ToFP16Row_NEON(const float * src,uint16_t * dst,int width)4065 void ConvertFP32ToFP16Row_NEON(const float* src,
4066                                uint16_t* dst,  // fp16
4067                                int width) {
4068   asm volatile(
4069       "1:                                        \n"
4070       "ldp         q2, q3, [%0], #32             \n"  // load 8 floats
4071       "subs        %w2, %w2, #8                  \n"  // 8 floats per loop
4072       "prfm        pldl1keep, [%0, 448]          \n"
4073       "fcvtn       v1.4h, v2.4s                  \n"  // 8 fp16 halffloats
4074       "fcvtn2      v1.8h, v3.4s                  \n"
4075       "str         q1, [%1], #16                 \n"  // store 8 fp16 halffloats
4076       "b.gt        1b                            \n"
4077       : "+r"(src),   // %0
4078         "+r"(dst),   // %1
4079         "+r"(width)  // %2
4080       :
4081       : "cc", "memory", "v1", "v2", "v3");
4082 }
4083 
ScaleMaxSamples_NEON(const float * src,float * dst,float scale,int width)4084 float ScaleMaxSamples_NEON(const float* src,
4085                            float* dst,
4086                            float scale,
4087                            int width) {
4088   float fmax;
4089   asm volatile(
4090       "movi        v5.4s, #0                     \n"  // max
4091       "movi        v6.4s, #0                     \n"
4092 
4093       "1:                                        \n"
4094       "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
4095       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
4096       "fmul        v3.4s, v1.4s, %4.s[0]         \n"  // scale
4097       "prfm        pldl1keep, [%0, 448]          \n"
4098       "fmul        v4.4s, v2.4s, %4.s[0]         \n"  // scale
4099       "fmax        v5.4s, v5.4s, v1.4s           \n"  // max
4100       "fmax        v6.4s, v6.4s, v2.4s           \n"
4101       "st1         {v3.4s, v4.4s}, [%1], #32     \n"  // store 8 samples
4102       "b.gt        1b                            \n"
4103       "fmax        v5.4s, v5.4s, v6.4s           \n"  // max
4104       "fmaxv       %s3, v5.4s                    \n"  // signed max acculator
4105       : "+r"(src),                                    // %0
4106         "+r"(dst),                                    // %1
4107         "+r"(width),                                  // %2
4108         "=w"(fmax)                                    // %3
4109       : "w"(scale)                                    // %4
4110       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
4111   return fmax;
4112 }
4113 
ScaleSumSamples_NEON(const float * src,float * dst,float scale,int width)4114 float ScaleSumSamples_NEON(const float* src,
4115                            float* dst,
4116                            float scale,
4117                            int width) {
4118   float fsum;
4119   asm volatile(
4120       "movi        v5.4s, #0                     \n"  // max
4121       "movi        v6.4s, #0                     \n"  // max
4122 
4123       "1:                                        \n"
4124       "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
4125       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
4126       "fmul        v3.4s, v1.4s, %4.s[0]         \n"  // scale
4127       "prfm        pldl1keep, [%0, 448]          \n"
4128       "fmul        v4.4s, v2.4s, %4.s[0]         \n"
4129       "fmla        v5.4s, v1.4s, v1.4s           \n"  // sum of squares
4130       "fmla        v6.4s, v2.4s, v2.4s           \n"
4131       "st1         {v3.4s, v4.4s}, [%1], #32     \n"  // store 8 samples
4132       "b.gt        1b                            \n"
4133       "faddp       v5.4s, v5.4s, v6.4s           \n"
4134       "faddp       v5.4s, v5.4s, v5.4s           \n"
4135       "faddp       %3.4s, v5.4s, v5.4s           \n"  // sum
4136       : "+r"(src),                                    // %0
4137         "+r"(dst),                                    // %1
4138         "+r"(width),                                  // %2
4139         "=w"(fsum)                                    // %3
4140       : "w"(scale)                                    // %4
4141       : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
4142   return fsum;
4143 }
4144 
ScaleSamples_NEON(const float * src,float * dst,float scale,int width)4145 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
4146   asm volatile(
4147       "1:                                        \n"
4148       "ld1         {v1.4s, v2.4s}, [%0], #32     \n"  // load 8 samples
4149       "prfm        pldl1keep, [%0, 448]          \n"
4150       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
4151       "fmul        v1.4s, v1.4s, %3.s[0]         \n"  // scale
4152       "fmul        v2.4s, v2.4s, %3.s[0]         \n"  // scale
4153       "st1         {v1.4s, v2.4s}, [%1], #32     \n"  // store 8 samples
4154       "b.gt        1b                            \n"
4155       : "+r"(src),   // %0
4156         "+r"(dst),   // %1
4157         "+r"(width)  // %2
4158       : "w"(scale)   // %3
4159       : "cc", "memory", "v1", "v2");
4160 }
4161 
4162 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussCol_NEON(const uint16_t * src0,const uint16_t * src1,const uint16_t * src2,const uint16_t * src3,const uint16_t * src4,uint32_t * dst,int width)4163 void GaussCol_NEON(const uint16_t* src0,
4164                    const uint16_t* src1,
4165                    const uint16_t* src2,
4166                    const uint16_t* src3,
4167                    const uint16_t* src4,
4168                    uint32_t* dst,
4169                    int width) {
4170   asm volatile(
4171       "movi        v6.8h, #4                     \n"  // constant 4
4172       "movi        v7.8h, #6                     \n"  // constant 6
4173 
4174       "1:                                        \n"
4175       "ld1         {v1.8h}, [%0], #16            \n"  // load 8 samples, 5 rows
4176       "ld1         {v2.8h}, [%4], #16            \n"
4177       "uaddl       v0.4s, v1.4h, v2.4h           \n"  // * 1
4178       "prfm        pldl1keep, [%0, 448]          \n"
4179       "uaddl2      v1.4s, v1.8h, v2.8h           \n"  // * 1
4180       "ld1         {v2.8h}, [%1], #16            \n"
4181       "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
4182       "prfm        pldl1keep, [%1, 448]          \n"
4183       "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
4184       "ld1         {v2.8h}, [%2], #16            \n"
4185       "umlal       v0.4s, v2.4h, v7.4h           \n"  // * 6
4186       "prfm        pldl1keep, [%2, 448]          \n"
4187       "umlal2      v1.4s, v2.8h, v7.8h           \n"  // * 6
4188       "ld1         {v2.8h}, [%3], #16            \n"
4189       "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
4190       "prfm        pldl1keep, [%3, 448]          \n"
4191       "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
4192       "subs        %w6, %w6, #8                  \n"  // 8 processed per loop
4193       "st1         {v0.4s,v1.4s}, [%5], #32      \n"  // store 8 samples
4194       "prfm        pldl1keep, [%4, 448]          \n"
4195       "b.gt        1b                            \n"
4196       : "+r"(src0),  // %0
4197         "+r"(src1),  // %1
4198         "+r"(src2),  // %2
4199         "+r"(src3),  // %3
4200         "+r"(src4),  // %4
4201         "+r"(dst),   // %5
4202         "+r"(width)  // %6
4203       :
4204       : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
4205 }
4206 
4207 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussRow_NEON(const uint32_t * src,uint16_t * dst,int width)4208 void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
4209   const uint32_t* src1 = src + 1;
4210   const uint32_t* src2 = src + 2;
4211   const uint32_t* src3 = src + 3;
4212   asm volatile(
4213       "movi        v6.4s, #4                     \n"  // constant 4
4214       "movi        v7.4s, #6                     \n"  // constant 6
4215 
4216       "1:                                        \n"
4217       "ld1         {v0.4s,v1.4s,v2.4s}, [%0], %6 \n"  // load 12 source samples
4218       "add         v0.4s, v0.4s, v1.4s           \n"  // * 1
4219       "add         v1.4s, v1.4s, v2.4s           \n"  // * 1
4220       "ld1         {v2.4s,v3.4s}, [%2], #32      \n"
4221       "mla         v0.4s, v2.4s, v7.4s           \n"  // * 6
4222       "mla         v1.4s, v3.4s, v7.4s           \n"  // * 6
4223       "ld1         {v2.4s,v3.4s}, [%1], #32      \n"
4224       "ld1         {v4.4s,v5.4s}, [%3], #32      \n"
4225       "add         v2.4s, v2.4s, v4.4s           \n"  // add rows for * 4
4226       "add         v3.4s, v3.4s, v5.4s           \n"
4227       "prfm        pldl1keep, [%0, 448]          \n"
4228       "mla         v0.4s, v2.4s, v6.4s           \n"  // * 4
4229       "mla         v1.4s, v3.4s, v6.4s           \n"  // * 4
4230       "subs        %w5, %w5, #8                  \n"  // 8 processed per loop
4231       "uqrshrn     v0.4h, v0.4s, #8              \n"  // round and pack
4232       "uqrshrn2    v0.8h, v1.4s, #8              \n"
4233       "st1         {v0.8h}, [%4], #16            \n"  // store 8 samples
4234       "b.gt        1b                            \n"
4235       : "+r"(src),   // %0
4236         "+r"(src1),  // %1
4237         "+r"(src2),  // %2
4238         "+r"(src3),  // %3
4239         "+r"(dst),   // %4
4240         "+r"(width)  // %5
4241       : "r"(32LL)    // %6
4242       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4243 }
4244 
4245 static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
4246 
4247 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussCol_F32_NEON(const float * src0,const float * src1,const float * src2,const float * src3,const float * src4,float * dst,int width)4248 void GaussCol_F32_NEON(const float* src0,
4249                        const float* src1,
4250                        const float* src2,
4251                        const float* src3,
4252                        const float* src4,
4253                        float* dst,
4254                        int width) {
4255   asm volatile(
4256       "ld2r        {v6.4s, v7.4s}, [%7]          \n"  // constants 4 and 6
4257 
4258       "1:                                        \n"
4259       "ld1         {v0.4s, v1.4s}, [%0], #32     \n"  // load 8 samples, 5 rows
4260       "ld1         {v2.4s, v3.4s}, [%1], #32     \n"
4261       "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
4262       "ld1         {v4.4s, v5.4s}, [%2], #32     \n"
4263       "fmla        v1.4s, v3.4s, v6.4s           \n"
4264       "prfm        pldl1keep, [%0, 448]          \n"
4265       "fmla        v0.4s, v4.4s, v7.4s           \n"  // * 6
4266       "ld1         {v2.4s, v3.4s}, [%3], #32     \n"
4267       "fmla        v1.4s, v5.4s, v7.4s           \n"
4268       "prfm        pldl1keep, [%1, 448]          \n"
4269       "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
4270       "ld1         {v4.4s, v5.4s}, [%4], #32     \n"
4271       "fmla        v1.4s, v3.4s, v6.4s           \n"
4272       "prfm        pldl1keep, [%2, 448]          \n"
4273       "fadd        v0.4s, v0.4s, v4.4s           \n"  // * 1
4274       "prfm        pldl1keep, [%3, 448]          \n"
4275       "fadd        v1.4s, v1.4s, v5.4s           \n"
4276       "prfm        pldl1keep, [%4, 448]          \n"
4277       "subs        %w6, %w6, #8                  \n"  // 8 processed per loop
4278       "st1         {v0.4s, v1.4s}, [%5], #32     \n"  // store 8 samples
4279       "b.gt        1b                            \n"
4280       : "+r"(src0),               // %0
4281         "+r"(src1),               // %1
4282         "+r"(src2),               // %2
4283         "+r"(src3),               // %3
4284         "+r"(src4),               // %4
4285         "+r"(dst),                // %5
4286         "+r"(width)               // %6
4287       : "r"(&kGaussCoefficients)  // %7
4288       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4289 }
4290 
4291 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
GaussRow_F32_NEON(const float * src,float * dst,int width)4292 void GaussRow_F32_NEON(const float* src, float* dst, int width) {
4293   asm volatile(
4294       "ld3r        {v6.4s, v7.4s, v8.4s}, [%3]   \n"  // constants 4, 6, 1/256
4295 
4296       "1:                                        \n"
4297       "ld1         {v0.4s, v1.4s, v2.4s}, [%0], %4 \n"  // load 12 samples, 5
4298                                                         // rows
4299       "fadd        v0.4s, v0.4s, v1.4s           \n"    // * 1
4300       "ld1         {v4.4s, v5.4s}, [%0], %5      \n"
4301       "fadd        v1.4s, v1.4s, v2.4s           \n"
4302       "fmla        v0.4s, v4.4s, v7.4s           \n"  // * 6
4303       "ld1         {v2.4s, v3.4s}, [%0], %4      \n"
4304       "fmla        v1.4s, v5.4s, v7.4s           \n"
4305       "ld1         {v4.4s, v5.4s}, [%0], %6      \n"
4306       "fadd        v2.4s, v2.4s, v4.4s           \n"
4307       "fadd        v3.4s, v3.4s, v5.4s           \n"
4308       "fmla        v0.4s, v2.4s, v6.4s           \n"  // * 4
4309       "fmla        v1.4s, v3.4s, v6.4s           \n"
4310       "prfm        pldl1keep, [%0, 448]          \n"
4311       "fmul        v0.4s, v0.4s, v8.4s           \n"  // / 256
4312       "fmul        v1.4s, v1.4s, v8.4s           \n"
4313       "subs        %w2, %w2, #8                  \n"  // 8 processed per loop
4314       "st1         {v0.4s, v1.4s}, [%1], #32     \n"  // store 8 samples
4315       "b.gt        1b                            \n"
4316       : "+r"(src),                 // %0
4317         "+r"(dst),                 // %1
4318         "+r"(width)                // %2
4319       : "r"(&kGaussCoefficients),  // %3
4320         "r"(8LL),                  // %4
4321         "r"(-4LL),                 // %5
4322         "r"(20LL)                  // %6
4323       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
4324 }
4325 
4326 #if LIBYUV_USE_ST3
4327 // Convert biplanar NV21 to packed YUV24
NV21ToYUV24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_yuv24,int width)4328 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
4329                          const uint8_t* src_vu,
4330                          uint8_t* dst_yuv24,
4331                          int width) {
4332   asm volatile(
4333       "1:                                        \n"
4334       "ld1         {v2.16b}, [%0], #16           \n"  // load 16 Y values
4335       "ld2         {v0.8b, v1.8b}, [%1], #16     \n"  // load 8 VU values
4336       "zip1        v0.16b, v0.16b, v0.16b        \n"  // replicate V values
4337       "prfm        pldl1keep, [%0, 448]          \n"
4338       "zip1        v1.16b, v1.16b, v1.16b        \n"  // replicate U values
4339       "prfm        pldl1keep, [%1, 448]          \n"
4340       "subs        %w3, %w3, #16                 \n"      // 16 pixels per loop
4341       "st3         {v0.16b,v1.16b,v2.16b}, [%2], #48 \n"  // store 16 YUV pixels
4342       "b.gt        1b                            \n"
4343       : "+r"(src_y),      // %0
4344         "+r"(src_vu),     // %1
4345         "+r"(dst_yuv24),  // %2
4346         "+r"(width)       // %3
4347       :
4348       : "cc", "memory", "v0", "v1", "v2");
4349 }
4350 #else
4351 static const uvec8 kYUV24Shuffle[3] = {
4352     {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
4353     {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
4354     {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
4355 
4356 // Convert biplanar NV21 to packed YUV24
4357 // NV21 has VU in memory for chroma.
4358 // YUV24 is VUY in memory
NV21ToYUV24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_yuv24,int width)4359 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
4360                          const uint8_t* src_vu,
4361                          uint8_t* dst_yuv24,
4362                          int width) {
4363   asm volatile(
4364       "ld1         {v5.16b,v6.16b,v7.16b}, [%4]  \n"  // 3 shuffler constants
4365       "1:                                        \n"
4366       "ld1         {v0.16b}, [%0], #16           \n"    // load 16 Y values
4367       "ld1         {v1.16b}, [%1], #16           \n"    // load 8 VU values
4368       "tbl         v2.16b, {v0.16b,v1.16b}, v5.16b \n"  // weave into YUV24
4369       "prfm        pldl1keep, [%0, 448]          \n"
4370       "tbl         v3.16b, {v0.16b,v1.16b}, v6.16b \n"
4371       "prfm        pldl1keep, [%1, 448]          \n"
4372       "tbl         v4.16b, {v0.16b,v1.16b}, v7.16b \n"
4373       "subs        %w3, %w3, #16                 \n"      // 16 pixels per loop
4374       "st1         {v2.16b,v3.16b,v4.16b}, [%2], #48 \n"  // store 16 YUV pixels
4375       "b.gt        1b                            \n"
4376       : "+r"(src_y),            // %0
4377         "+r"(src_vu),           // %1
4378         "+r"(dst_yuv24),        // %2
4379         "+r"(width)             // %3
4380       : "r"(&kYUV24Shuffle[0])  // %4
4381       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4382 }
4383 #endif  // LIBYUV_USE_ST3
4384 
4385 // Note ST2 8b version is faster than zip+ST1
4386 
4387 // AYUV is VUYA in memory.  UV for NV12 is UV order in memory.
AYUVToUVRow_NEON(const uint8_t * src_ayuv,int src_stride_ayuv,uint8_t * dst_uv,int width)4388 void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
4389                       int src_stride_ayuv,
4390                       uint8_t* dst_uv,
4391                       int width) {
4392   const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
4393   asm volatile(
4394 
4395       "1:                                        \n"
4396       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
4397       "uaddlp      v0.8h, v0.16b                 \n"  // V 16 bytes -> 8 shorts.
4398       "prfm        pldl1keep, [%0, 448]          \n"
4399       "uaddlp      v1.8h, v1.16b                 \n"  // U 16 bytes -> 8 shorts.
4400       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
4401       "uadalp      v0.8h, v4.16b                 \n"  // V 16 bytes -> 8 shorts.
4402       "uadalp      v1.8h, v5.16b                 \n"  // U 16 bytes -> 8 shorts.
4403       "prfm        pldl1keep, [%1, 448]          \n"
4404       "uqrshrn     v3.8b, v0.8h, #2              \n"  // 2x2 average
4405       "uqrshrn     v2.8b, v1.8h, #2              \n"
4406       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
4407       "st2         {v2.8b,v3.8b}, [%2], #16      \n"  // store 8 pixels UV.
4408       "b.gt        1b                            \n"
4409       : "+r"(src_ayuv),    // %0
4410         "+r"(src_ayuv_1),  // %1
4411         "+r"(dst_uv),      // %2
4412         "+r"(width)        // %3
4413       :
4414       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4415 }
4416 
AYUVToVURow_NEON(const uint8_t * src_ayuv,int src_stride_ayuv,uint8_t * dst_vu,int width)4417 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
4418                       int src_stride_ayuv,
4419                       uint8_t* dst_vu,
4420                       int width) {
4421   const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
4422   asm volatile(
4423 
4424       "1:                                        \n"
4425       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
4426       "uaddlp      v0.8h, v0.16b                 \n"  // V 16 bytes -> 8 shorts.
4427       "prfm        pldl1keep, [%0, 448]          \n"
4428       "uaddlp      v1.8h, v1.16b                 \n"  // U 16 bytes -> 8 shorts.
4429       "ld4         {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
4430       "uadalp      v0.8h, v4.16b                 \n"  // V 16 bytes -> 8 shorts.
4431       "uadalp      v1.8h, v5.16b                 \n"  // U 16 bytes -> 8 shorts.
4432       "prfm        pldl1keep, [%1, 448]          \n"
4433       "uqrshrn     v0.8b, v0.8h, #2              \n"  // 2x2 average
4434       "uqrshrn     v1.8b, v1.8h, #2              \n"
4435       "subs        %w3, %w3, #16                 \n"  // 16 processed per loop.
4436       "st2         {v0.8b,v1.8b}, [%2], #16      \n"  // store 8 pixels VU.
4437       "b.gt        1b                            \n"
4438       : "+r"(src_ayuv),    // %0
4439         "+r"(src_ayuv_1),  // %1
4440         "+r"(dst_vu),      // %2
4441         "+r"(width)        // %3
4442       :
4443       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4444 }
4445 
4446 // Copy row of AYUV Y's into Y
AYUVToYRow_NEON(const uint8_t * src_ayuv,uint8_t * dst_y,int width)4447 void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
4448   asm volatile(
4449       "1:                                        \n"
4450       "ld4         {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
4451       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
4452       "prfm        pldl1keep, [%0, 448]          \n"
4453       "st1         {v2.16b}, [%1], #16           \n"  // store 16 Y pixels
4454       "b.gt        1b                            \n"
4455       : "+r"(src_ayuv),  // %0
4456         "+r"(dst_y),     // %1
4457         "+r"(width)      // %2
4458       :
4459       : "cc", "memory", "v0", "v1", "v2", "v3");
4460 }
4461 
4462 // Shuffle table for swapping UV bytes.
4463 static const uvec8 kShuffleSwapUV = {1u, 0u, 3u,  2u,  5u,  4u,  7u,  6u,
4464                                      9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
4465 
4466 // Convert UV plane of NV12 to VU of NV21.
SwapUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_vu,int width)4467 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
4468   asm volatile(
4469       "ld1         {v2.16b}, [%3]                \n"  // shuffler
4470       "1:                                        \n"
4471       "ld1         {v0.16b}, [%0], 16            \n"  // load 16 UV values
4472       "ld1         {v1.16b}, [%0], 16            \n"
4473       "subs        %w2, %w2, #16                 \n"  // 16 pixels per loop
4474       "tbl         v0.16b, {v0.16b}, v2.16b      \n"
4475       "prfm        pldl1keep, [%0, 448]          \n"
4476       "tbl         v1.16b, {v1.16b}, v2.16b      \n"
4477       "stp         q0, q1, [%1], 32              \n"  // store 16 VU pixels
4478       "b.gt        1b                            \n"
4479       : "+r"(src_uv),         // %0
4480         "+r"(dst_vu),         // %1
4481         "+r"(width)           // %2
4482       : "r"(&kShuffleSwapUV)  // %3
4483       : "cc", "memory", "v0", "v1", "v2");
4484 }
4485 
HalfMergeUVRow_NEON(const uint8_t * src_u,int src_stride_u,const uint8_t * src_v,int src_stride_v,uint8_t * dst_uv,int width)4486 void HalfMergeUVRow_NEON(const uint8_t* src_u,
4487                          int src_stride_u,
4488                          const uint8_t* src_v,
4489                          int src_stride_v,
4490                          uint8_t* dst_uv,
4491                          int width) {
4492   const uint8_t* src_u_1 = src_u + src_stride_u;
4493   const uint8_t* src_v_1 = src_v + src_stride_v;
4494   asm volatile(
4495       "1:                                        \n"
4496       "ld1         {v0.16b}, [%0], #16           \n"  // load 16 U values
4497       "ld1         {v1.16b}, [%2], #16           \n"  // load 16 V values
4498       "ld1         {v2.16b}, [%1], #16           \n"
4499       "ld1         {v3.16b}, [%3], #16           \n"
4500       "uaddlp      v0.8h, v0.16b                 \n"  // half size
4501       "prfm        pldl1keep, [%0, 448]          \n"
4502       "uaddlp      v1.8h, v1.16b                 \n"
4503       "prfm        pldl1keep, [%2, 448]          \n"
4504       "uadalp      v0.8h, v2.16b                 \n"
4505       "prfm        pldl1keep, [%1, 448]          \n"
4506       "uadalp      v1.8h, v3.16b                 \n"
4507       "prfm        pldl1keep, [%3, 448]          \n"
4508       "uqrshrn     v0.8b, v0.8h, #2              \n"
4509       "uqrshrn     v1.8b, v1.8h, #2              \n"
4510       "subs        %w5, %w5, #16                 \n"  // 16 src pixels per loop
4511       "st2         {v0.8b, v1.8b}, [%4], #16     \n"  // store 8 UV pixels
4512       "b.gt        1b                            \n"
4513       : "+r"(src_u),    // %0
4514         "+r"(src_u_1),  // %1
4515         "+r"(src_v),    // %2
4516         "+r"(src_v_1),  // %3
4517         "+r"(dst_uv),   // %4
4518         "+r"(width)     // %5
4519       :
4520       : "cc", "memory", "v0", "v1", "v2", "v3");
4521 }
4522 
SplitUVRow_16_NEON(const uint16_t * src_uv,uint16_t * dst_u,uint16_t * dst_v,int depth,int width)4523 void SplitUVRow_16_NEON(const uint16_t* src_uv,
4524                         uint16_t* dst_u,
4525                         uint16_t* dst_v,
4526                         int depth,
4527                         int width) {
4528   int shift = depth - 16;  // Negative for right shift.
4529   asm volatile(
4530       "dup         v2.8h, %w4                    \n"
4531       "1:                                        \n"
4532       "ld2         {v0.8h, v1.8h}, [%0], #32     \n"  // load 8 UV
4533       "subs        %w3, %w3, #8                  \n"  // 8 src pixels per loop
4534       "ushl        v0.8h, v0.8h, v2.8h           \n"
4535       "prfm        pldl1keep, [%0, 448]          \n"
4536       "ushl        v1.8h, v1.8h, v2.8h           \n"
4537       "st1         {v0.8h}, [%1], #16            \n"  // store 8 U pixels
4538       "st1         {v1.8h}, [%2], #16            \n"  // store 8 V pixels
4539       "b.gt        1b                            \n"
4540       : "+r"(src_uv),  // %0
4541         "+r"(dst_u),   // %1
4542         "+r"(dst_v),   // %2
4543         "+r"(width)    // %3
4544       : "r"(shift)     // %4
4545       : "cc", "memory", "v0", "v1", "v2");
4546 }
4547 
MultiplyRow_16_NEON(const uint16_t * src_y,uint16_t * dst_y,int scale,int width)4548 void MultiplyRow_16_NEON(const uint16_t* src_y,
4549                          uint16_t* dst_y,
4550                          int scale,
4551                          int width) {
4552   asm volatile(
4553       "dup         v2.8h, %w3                    \n"
4554       "1:                                        \n"
4555       "ldp         q0, q1, [%0], #32             \n"
4556       "mul         v0.8h, v0.8h, v2.8h           \n"
4557       "prfm        pldl1keep, [%0, 448]          \n"
4558       "mul         v1.8h, v1.8h, v2.8h           \n"
4559       "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
4560       "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
4561       "b.gt        1b                            \n"
4562       : "+r"(src_y),  // %0
4563         "+r"(dst_y),  // %1
4564         "+r"(width)   // %2
4565       : "r"(scale)    // %3
4566       : "cc", "memory", "v0", "v1", "v2");
4567 }
4568 
DivideRow_16_NEON(const uint16_t * src_y,uint16_t * dst_y,int scale,int width)4569 void DivideRow_16_NEON(const uint16_t* src_y,
4570                        uint16_t* dst_y,
4571                        int scale,
4572                        int width) {
4573   asm volatile(
4574       "dup         v4.8h, %w3                    \n"
4575       "1:                                        \n"
4576       "ldp         q2, q3, [%0], #32             \n"
4577       "umull       v0.4s, v2.4h, v4.4h           \n"
4578       "umull2      v1.4s, v2.8h, v4.8h           \n"
4579       "umull       v2.4s, v3.4h, v4.4h           \n"
4580       "umull2      v3.4s, v3.8h, v4.8h           \n"
4581       "prfm        pldl1keep, [%0, 448]          \n"
4582       "shrn        v0.4h, v0.4s, #16             \n"
4583       "shrn2       v0.8h, v1.4s, #16             \n"
4584       "shrn        v1.4h, v2.4s, #16             \n"
4585       "shrn2       v1.8h, v3.4s, #16             \n"
4586       "stp         q0, q1, [%1], #32             \n"  // store 16 pixels
4587       "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
4588       "b.gt        1b                            \n"
4589       : "+r"(src_y),  // %0
4590         "+r"(dst_y),  // %1
4591         "+r"(width)   // %2
4592       : "r"(scale)    // %3
4593       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
4594 }
4595 
4596 // Use scale to convert lsb formats to msb, depending how many bits there are:
4597 // 32768 = 9 bits = shr 1
4598 // 16384 = 10 bits = shr 2
4599 // 4096 = 12 bits = shr 4
4600 // 256 = 16 bits = shr 8
Convert16To8Row_NEON(const uint16_t * src_y,uint8_t * dst_y,int scale,int width)4601 void Convert16To8Row_NEON(const uint16_t* src_y,
4602                           uint8_t* dst_y,
4603                           int scale,
4604                           int width) {
4605   int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
4606   asm volatile(
4607       "dup         v2.8h, %w3                    \n"
4608       "1:                                        \n"
4609       "ldp         q0, q1, [%0], #32             \n"
4610       "ushl        v0.8h, v0.8h, v2.8h           \n"  // shr = v2 is negative
4611       "ushl        v1.8h, v1.8h, v2.8h           \n"
4612       "prfm        pldl1keep, [%0, 448]          \n"
4613       "uqxtn       v0.8b, v0.8h                  \n"
4614       "uqxtn2      v0.16b, v1.8h                 \n"
4615       "subs        %w2, %w2, #16                 \n"  // 16 src pixels per loop
4616       "str         q0, [%1], #16                 \n"  // store 16 pixels
4617       "b.gt        1b                            \n"
4618       : "+r"(src_y),  // %0
4619         "+r"(dst_y),  // %1
4620         "+r"(width)   // %2
4621       : "r"(shift)    // %3
4622       : "cc", "memory", "v0", "v1", "v2");
4623 }
4624 
4625 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
4626 
4627 #ifdef __cplusplus
4628 }  // extern "C"
4629 }  // namespace libyuv
4630 #endif
4631