1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // Enable LIBYUV_USE_ST2, LIBYUV_USE_ST3, LIBYUV_USE_ST4 for CPUs that prefer
19 // STn over ZIP1+ST1
20 // Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions.
21
22 // This module is for GCC Neon armv8 64 bit.
23 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
24
25 // v0.8h: Y
26 // v1.16b: 8U, 8V
27
28 // Read 8 Y, 4 U and 4 V from 422
29 #define READYUV422 \
30 "ldr d0, [%[src_y]], #8 \n" \
31 "ld1 {v1.s}[0], [%[src_u]], #4 \n" \
32 "ld1 {v1.s}[1], [%[src_v]], #4 \n" \
33 "zip1 v0.16b, v0.16b, v0.16b \n" \
34 "prfm pldl1keep, [%[src_y], 448] \n" \
35 "zip1 v1.16b, v1.16b, v1.16b \n" \
36 "prfm pldl1keep, [%[src_u], 128] \n" \
37 "prfm pldl1keep, [%[src_v], 128] \n"
38
39 // Read 8 Y, 8 U and 8 V from 444
40 #define READYUV444 \
41 "ldr d0, [%[src_y]], #8 \n" \
42 "ld1 {v1.d}[0], [%[src_u]], #8 \n" \
43 "prfm pldl1keep, [%[src_y], 448] \n" \
44 "ld1 {v1.d}[1], [%[src_v]], #8 \n" \
45 "prfm pldl1keep, [%[src_u], 448] \n" \
46 "zip1 v0.16b, v0.16b, v0.16b \n" \
47 "prfm pldl1keep, [%[src_v], 448] \n"
48
49 // Read 8 Y, and set 4 U and 4 V to 128
50 #define READYUV400 \
51 "ldr d0, [%[src_y]], #8 \n" \
52 "movi v1.16b, #128 \n" \
53 "prfm pldl1keep, [%[src_y], 448] \n" \
54 "zip1 v0.16b, v0.16b, v0.16b \n"
55
56 static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
57 1, 1, 3, 3, 5, 5, 7, 7};
58 static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
59 0, 0, 2, 2, 4, 4, 6, 6};
60
61 // Read 8 Y and 4 UV from NV12 or NV21
62 #define READNV12 \
63 "ldr d0, [%[src_y]], #8 \n" \
64 "ldr d1, [%[src_uv]], #8 \n" \
65 "zip1 v0.16b, v0.16b, v0.16b \n" \
66 "prfm pldl1keep, [%[src_y], 448] \n" \
67 "tbl v1.16b, {v1.16b}, v2.16b \n" \
68 "prfm pldl1keep, [%[src_uv], 448] \n"
69
70 // Read 8 YUY2
71 #define READYUY2 \
72 "ld2 {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
73 "zip1 v0.16b, v0.16b, v0.16b \n" \
74 "prfm pldl1keep, [%[src_yuy2], 448] \n" \
75 "tbl v1.16b, {v1.16b}, v2.16b \n"
76
77 // Read 8 UYVY
78 #define READUYVY \
79 "ld2 {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
80 "zip1 v0.16b, v4.16b, v4.16b \n" \
81 "prfm pldl1keep, [%[src_uyvy], 448] \n" \
82 "tbl v1.16b, {v3.16b}, v2.16b \n"
83
84 // UB VR UG VG
85 // YG BB BG BR
86 #define YUVTORGB_SETUP \
87 "ld4r {v28.16b, v29.16b, v30.16b, v31.16b}, [%[kUVCoeff]] \n" \
88 "ld4r {v24.8h, v25.8h, v26.8h, v27.8h}, [%[kRGBCoeffBias]] \n"
89
90 // v16.8h: B
91 // v17.8h: G
92 // v18.8h: R
93
94 // Convert from YUV to 2.14 fixed point RGB
95 #define YUVTORGB \
96 "umull2 v3.4s, v0.8h, v24.8h \n" \
97 "umull v6.8h, v1.8b, v30.8b \n" \
98 "umull v0.4s, v0.4h, v24.4h \n" \
99 "umlal2 v6.8h, v1.16b, v31.16b \n" /* DG */ \
100 "uqshrn v0.4h, v0.4s, #16 \n" \
101 "uqshrn2 v0.8h, v3.4s, #16 \n" /* Y */ \
102 "umull v4.8h, v1.8b, v28.8b \n" /* DB */ \
103 "umull2 v5.8h, v1.16b, v29.16b \n" /* DR */ \
104 "add v17.8h, v0.8h, v26.8h \n" /* G */ \
105 "add v16.8h, v0.8h, v4.8h \n" /* B */ \
106 "add v18.8h, v0.8h, v5.8h \n" /* R */ \
107 "uqsub v17.8h, v17.8h, v6.8h \n" /* G */ \
108 "uqsub v16.8h, v16.8h, v25.8h \n" /* B */ \
109 "uqsub v18.8h, v18.8h, v27.8h \n" /* R */
110
111 // Convert from 2.14 fixed point RGB to 8 bit RGB
112 #define RGBTORGB8 \
113 "uqshrn v17.8b, v17.8h, #6 \n" \
114 "uqshrn v16.8b, v16.8h, #6 \n" \
115 "uqshrn v18.8b, v18.8h, #6 \n"
116
117 #define YUVTORGB_REGS \
118 "v0", "v1", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v24", "v25", \
119 "v26", "v27", "v28", "v29", "v30", "v31"
120
121 void I444ToARGBRow_NEON(const uint8_t* src_y,
122 const uint8_t* src_u,
123 const uint8_t* src_v,
124 uint8_t* dst_argb,
125 const struct YuvConstants* yuvconstants,
126 int width) {
127 asm volatile(
128 YUVTORGB_SETUP
129 "movi v19.8b, #255 \n" /* A */
130 "1: \n" READYUV444 YUVTORGB
131 RGBTORGB8
132 "subs %w[width], %w[width], #8 \n"
133 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
134 "b.gt 1b \n"
135 : [src_y] "+r"(src_y), // %[src_y]
136 [src_u] "+r"(src_u), // %[src_u]
137 [src_v] "+r"(src_v), // %[src_v]
138 [dst_argb] "+r"(dst_argb), // %[dst_argb]
139 [width] "+r"(width) // %[width]
140 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
141 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
142 : "cc", "memory", YUVTORGB_REGS, "v19");
143 }
144
145 void I444ToRGB24Row_NEON(const uint8_t* src_y,
146 const uint8_t* src_u,
147 const uint8_t* src_v,
148 uint8_t* dst_rgb24,
149 const struct YuvConstants* yuvconstants,
150 int width) {
151 asm volatile(
152 YUVTORGB_SETUP
153 "1: \n" READYUV444 YUVTORGB
154 RGBTORGB8
155 "subs %w[width], %w[width], #8 \n"
156 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
157 "b.gt 1b \n"
158 : [src_y] "+r"(src_y), // %[src_y]
159 [src_u] "+r"(src_u), // %[src_u]
160 [src_v] "+r"(src_v), // %[src_v]
161 [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
162 [width] "+r"(width) // %[width]
163 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
164 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
165 : "cc", "memory", YUVTORGB_REGS);
166 }
167
168 void I422ToARGBRow_NEON(const uint8_t* src_y,
169 const uint8_t* src_u,
170 const uint8_t* src_v,
171 uint8_t* dst_argb,
172 const struct YuvConstants* yuvconstants,
173 int width) {
174 asm volatile(
175 YUVTORGB_SETUP
176 "movi v19.8b, #255 \n" /* A */
177 "1: \n" READYUV422 YUVTORGB
178 RGBTORGB8
179 "subs %w[width], %w[width], #8 \n"
180 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
181 "b.gt 1b \n"
182 : [src_y] "+r"(src_y), // %[src_y]
183 [src_u] "+r"(src_u), // %[src_u]
184 [src_v] "+r"(src_v), // %[src_v]
185 [dst_argb] "+r"(dst_argb), // %[dst_argb]
186 [width] "+r"(width) // %[width]
187 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
188 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
189 : "cc", "memory", YUVTORGB_REGS, "v19");
190 }
191
192 void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
193 const uint8_t* src_u,
194 const uint8_t* src_v,
195 const uint8_t* src_a,
196 uint8_t* dst_argb,
197 const struct YuvConstants* yuvconstants,
198 int width) {
199 asm volatile(
200 YUVTORGB_SETUP
201 "1: \n"
202 "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV444
203 "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
204 "subs %w[width], %w[width], #8 \n"
205 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
206 "b.gt 1b \n"
207 : [src_y] "+r"(src_y), // %[src_y]
208 [src_u] "+r"(src_u), // %[src_u]
209 [src_v] "+r"(src_v), // %[src_v]
210 [src_a] "+r"(src_a), // %[src_a]
211 [dst_argb] "+r"(dst_argb), // %[dst_argb]
212 [width] "+r"(width) // %[width]
213 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
214 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
215 : "cc", "memory", YUVTORGB_REGS, "v19");
216 }
217
218 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
219 const uint8_t* src_u,
220 const uint8_t* src_v,
221 const uint8_t* src_a,
222 uint8_t* dst_argb,
223 const struct YuvConstants* yuvconstants,
224 int width) {
225 asm volatile(
226 YUVTORGB_SETUP
227 "1: \n"
228 "ld1 {v19.8b}, [%[src_a]], #8 \n" READYUV422
229 "prfm pldl1keep, [%[src_a], 448] \n" YUVTORGB RGBTORGB8
230 "subs %w[width], %w[width], #8 \n"
231 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
232 "b.gt 1b \n"
233 : [src_y] "+r"(src_y), // %[src_y]
234 [src_u] "+r"(src_u), // %[src_u]
235 [src_v] "+r"(src_v), // %[src_v]
236 [src_a] "+r"(src_a), // %[src_a]
237 [dst_argb] "+r"(dst_argb), // %[dst_argb]
238 [width] "+r"(width) // %[width]
239 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
240 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
241 : "cc", "memory", YUVTORGB_REGS, "v19");
242 }
243
244 void I422ToRGBARow_NEON(const uint8_t* src_y,
245 const uint8_t* src_u,
246 const uint8_t* src_v,
247 uint8_t* dst_rgba,
248 const struct YuvConstants* yuvconstants,
249 int width) {
250 asm volatile(
251 YUVTORGB_SETUP
252 "movi v15.8b, #255 \n" /* A */
253 "1: \n" READYUV422 YUVTORGB
254 RGBTORGB8
255 "subs %w[width], %w[width], #8 \n"
256 "st4 {v15.8b,v16.8b,v17.8b,v18.8b}, [%[dst_rgba]], #32 \n"
257 "b.gt 1b \n"
258 : [src_y] "+r"(src_y), // %[src_y]
259 [src_u] "+r"(src_u), // %[src_u]
260 [src_v] "+r"(src_v), // %[src_v]
261 [dst_rgba] "+r"(dst_rgba), // %[dst_rgba]
262 [width] "+r"(width) // %[width]
263 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
264 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
265 : "cc", "memory", YUVTORGB_REGS, "v15");
266 }
267
268 void I422ToRGB24Row_NEON(const uint8_t* src_y,
269 const uint8_t* src_u,
270 const uint8_t* src_v,
271 uint8_t* dst_rgb24,
272 const struct YuvConstants* yuvconstants,
273 int width) {
274 asm volatile(
275 YUVTORGB_SETUP
276 "1: \n" READYUV422 YUVTORGB
277 RGBTORGB8
278 "subs %w[width], %w[width], #8 \n"
279 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
280 "b.gt 1b \n"
281 : [src_y] "+r"(src_y), // %[src_y]
282 [src_u] "+r"(src_u), // %[src_u]
283 [src_v] "+r"(src_v), // %[src_v]
284 [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
285 [width] "+r"(width) // %[width]
286 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
287 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
288 : "cc", "memory", YUVTORGB_REGS);
289 }
290
291 #define ARGBTORGB565 \
292 "shll v18.8h, v18.8b, #8 \n" /* R */ \
293 "shll v17.8h, v17.8b, #8 \n" /* G */ \
294 "shll v16.8h, v16.8b, #8 \n" /* B */ \
295 "sri v18.8h, v17.8h, #5 \n" /* RG */ \
296 "sri v18.8h, v16.8h, #11 \n" /* RGB */
297
298 void I422ToRGB565Row_NEON(const uint8_t* src_y,
299 const uint8_t* src_u,
300 const uint8_t* src_v,
301 uint8_t* dst_rgb565,
302 const struct YuvConstants* yuvconstants,
303 int width) {
304 asm volatile(
305 YUVTORGB_SETUP
306 "1: \n" READYUV422 YUVTORGB
307 RGBTORGB8 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
308 "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565.
309 "b.gt 1b \n"
310 : [src_y] "+r"(src_y), // %[src_y]
311 [src_u] "+r"(src_u), // %[src_u]
312 [src_v] "+r"(src_v), // %[src_v]
313 [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
314 [width] "+r"(width) // %[width]
315 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
316 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
317 : "cc", "memory", YUVTORGB_REGS);
318 }
319
320 #define ARGBTOARGB1555 \
321 "shll v0.8h, v19.8b, #8 \n" /* A */ \
322 "shll v18.8h, v18.8b, #8 \n" /* R */ \
323 "shll v17.8h, v17.8b, #8 \n" /* G */ \
324 "shll v16.8h, v16.8b, #8 \n" /* B */ \
325 "sri v0.8h, v18.8h, #1 \n" /* AR */ \
326 "sri v0.8h, v17.8h, #6 \n" /* ARG */ \
327 "sri v0.8h, v16.8h, #11 \n" /* ARGB */
328
329 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
330 const uint8_t* src_u,
331 const uint8_t* src_v,
332 uint8_t* dst_argb1555,
333 const struct YuvConstants* yuvconstants,
334 int width) {
335 asm volatile(
336 YUVTORGB_SETUP
337 "movi v19.8b, #255 \n"
338 "1: \n" READYUV422 YUVTORGB
339 RGBTORGB8
340 "subs %w[width], %w[width], #8 \n" ARGBTOARGB1555
341 "st1 {v0.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
342 // ARGB1555.
343 "b.gt 1b \n"
344 : [src_y] "+r"(src_y), // %[src_y]
345 [src_u] "+r"(src_u), // %[src_u]
346 [src_v] "+r"(src_v), // %[src_v]
347 [dst_argb1555] "+r"(dst_argb1555), // %[dst_argb1555]
348 [width] "+r"(width) // %[width]
349 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
350 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
351 : "cc", "memory", YUVTORGB_REGS, "v19");
352 }
353
354 #define ARGBTOARGB4444 \
355 /* Input v16.8b<=B, v17.8b<=G, v18.8b<=R, v19.8b<=A, v23.8b<=0x0f */ \
356 "ushr v16.8b, v16.8b, #4 \n" /* B */ \
357 "bic v17.8b, v17.8b, v23.8b \n" /* G */ \
358 "ushr v18.8b, v18.8b, #4 \n" /* R */ \
359 "bic v19.8b, v19.8b, v23.8b \n" /* A */ \
360 "orr v0.8b, v16.8b, v17.8b \n" /* BG */ \
361 "orr v1.8b, v18.8b, v19.8b \n" /* RA */ \
362 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
363
364 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
365 const uint8_t* src_u,
366 const uint8_t* src_v,
367 uint8_t* dst_argb4444,
368 const struct YuvConstants* yuvconstants,
369 int width) {
370 asm volatile(
371 YUVTORGB_SETUP
372 "movi v23.16b, #0x0f \n" // bits to clear with
373 // bic.
374 "1: \n" READYUV422 YUVTORGB
375 RGBTORGB8
376 "subs %w[width], %w[width], #8 \n"
377 "movi v19.8b, #255 \n" ARGBTOARGB4444
378 "st1 {v0.8h}, [%[dst_argb4444]], #16 \n" // store 8
379 // pixels
380 // ARGB4444.
381 "b.gt 1b \n"
382 : [src_y] "+r"(src_y), // %[src_y]
383 [src_u] "+r"(src_u), // %[src_u]
384 [src_v] "+r"(src_v), // %[src_v]
385 [dst_argb4444] "+r"(dst_argb4444), // %[dst_argb4444]
386 [width] "+r"(width) // %[width]
387 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
388 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
389 : "cc", "memory", YUVTORGB_REGS, "v19", "v23");
390 }
391
392 void I400ToARGBRow_NEON(const uint8_t* src_y,
393 uint8_t* dst_argb,
394 const struct YuvConstants* yuvconstants,
395 int width) {
396 asm volatile(
397 YUVTORGB_SETUP
398 "movi v19.8b, #255 \n"
399 "1: \n" READYUV400 YUVTORGB
400 RGBTORGB8
401 "subs %w[width], %w[width], #8 \n"
402 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
403 "b.gt 1b \n"
404 : [src_y] "+r"(src_y), // %[src_y]
405 [dst_argb] "+r"(dst_argb), // %[dst_argb]
406 [width] "+r"(width) // %[width]
407 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
408 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias) // %[kRGBCoeffBias]
409 : "cc", "memory", YUVTORGB_REGS, "v19");
410 }
411
412 #if LIBYUV_USE_ST4
413 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
414 asm volatile(
415 "movi v23.8b, #255 \n"
416 "1: \n"
417 "ld1 {v20.8b}, [%0], #8 \n"
418 "prfm pldl1keep, [%0, 448] \n"
419 "orr v21.8b, v20.8b, v20.8b \n"
420 "orr v22.8b, v20.8b, v20.8b \n"
421 "subs %w2, %w2, #8 \n"
422 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
423 "b.gt 1b \n"
424 : "+r"(src_y), // %0
425 "+r"(dst_argb), // %1
426 "+r"(width) // %2
427 :
428 : "cc", "memory", "v20", "v21", "v22", "v23");
429 }
430 #else
431 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
432 asm volatile(
433 "movi v20.8b, #255 \n"
434 "1: \n"
435 "ldr d16, [%0], #8 \n"
436 "subs %w2, %w2, #8 \n"
437 "zip1 v18.16b, v16.16b, v16.16b \n" // YY
438 "zip1 v19.16b, v16.16b, v20.16b \n" // YA
439 "prfm pldl1keep, [%0, 448] \n"
440 "zip1 v16.16b, v18.16b, v19.16b \n" // YYYA
441 "zip2 v17.16b, v18.16b, v19.16b \n"
442 "stp q16, q17, [%1], #32 \n"
443 "b.gt 1b \n"
444 : "+r"(src_y), // %0
445 "+r"(dst_argb), // %1
446 "+r"(width) // %2
447 :
448 : "cc", "memory", "v16", "v17", "v18", "v19", "v20");
449 }
450 #endif // LIBYUV_USE_ST4
451
452 void NV12ToARGBRow_NEON(const uint8_t* src_y,
453 const uint8_t* src_uv,
454 uint8_t* dst_argb,
455 const struct YuvConstants* yuvconstants,
456 int width) {
457 asm volatile(
458 YUVTORGB_SETUP
459 "movi v19.8b, #255 \n"
460 "ldr q2, [%[kNV12Table]] \n"
461 "1: \n" READNV12 YUVTORGB RGBTORGB8
462 "subs %w[width], %w[width], #8 \n"
463 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
464 "b.gt 1b \n"
465 : [src_y] "+r"(src_y), // %[src_y]
466 [src_uv] "+r"(src_uv), // %[src_uv]
467 [dst_argb] "+r"(dst_argb), // %[dst_argb]
468 [width] "+r"(width) // %[width]
469 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
470 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
471 [kNV12Table] "r"(&kNV12Table)
472 : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
473 }
474
475 void NV21ToARGBRow_NEON(const uint8_t* src_y,
476 const uint8_t* src_vu,
477 uint8_t* dst_argb,
478 const struct YuvConstants* yuvconstants,
479 int width) {
480 asm volatile(
481 YUVTORGB_SETUP
482 "movi v19.8b, #255 \n"
483 "ldr q2, [%[kNV12Table]] \n"
484 "1: \n" READNV12 YUVTORGB RGBTORGB8
485 "subs %w[width], %w[width], #8 \n"
486 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
487 "b.gt 1b \n"
488 : [src_y] "+r"(src_y), // %[src_y]
489 [src_uv] "+r"(src_vu), // %[src_uv]
490 [dst_argb] "+r"(dst_argb), // %[dst_argb]
491 [width] "+r"(width) // %[width]
492 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
493 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
494 [kNV12Table] "r"(&kNV21Table)
495 : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
496 }
497
498 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
499 const uint8_t* src_uv,
500 uint8_t* dst_rgb24,
501 const struct YuvConstants* yuvconstants,
502 int width) {
503 asm volatile(
504 YUVTORGB_SETUP
505 "ldr q2, [%[kNV12Table]] \n"
506 "1: \n" READNV12 YUVTORGB RGBTORGB8
507 "subs %w[width], %w[width], #8 \n"
508 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
509 "b.gt 1b \n"
510 : [src_y] "+r"(src_y), // %[src_y]
511 [src_uv] "+r"(src_uv), // %[src_uv]
512 [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
513 [width] "+r"(width) // %[width]
514 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
515 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
516 [kNV12Table] "r"(&kNV12Table)
517 : "cc", "memory", YUVTORGB_REGS, "v2");
518 }
519
520 void NV21ToRGB24Row_NEON(const uint8_t* src_y,
521 const uint8_t* src_vu,
522 uint8_t* dst_rgb24,
523 const struct YuvConstants* yuvconstants,
524 int width) {
525 asm volatile(
526 YUVTORGB_SETUP
527 "ldr q2, [%[kNV12Table]] \n"
528 "1: \n" READNV12 YUVTORGB RGBTORGB8
529 "subs %w[width], %w[width], #8 \n"
530 "st3 {v16.8b,v17.8b,v18.8b}, [%[dst_rgb24]], #24 \n"
531 "b.gt 1b \n"
532 : [src_y] "+r"(src_y), // %[src_y]
533 [src_uv] "+r"(src_vu), // %[src_uv]
534 [dst_rgb24] "+r"(dst_rgb24), // %[dst_rgb24]
535 [width] "+r"(width) // %[width]
536 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
537 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
538 [kNV12Table] "r"(&kNV21Table)
539 : "cc", "memory", YUVTORGB_REGS, "v2");
540 }
541
542 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
543 const uint8_t* src_uv,
544 uint8_t* dst_rgb565,
545 const struct YuvConstants* yuvconstants,
546 int width) {
547 asm volatile(
548 YUVTORGB_SETUP
549 "ldr q2, [%[kNV12Table]] \n"
550 "1: \n" READNV12 YUVTORGB RGBTORGB8
551 "subs %w[width], %w[width], #8 \n" ARGBTORGB565
552 "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8
553 // pixels
554 // RGB565.
555 "b.gt 1b \n"
556 : [src_y] "+r"(src_y), // %[src_y]
557 [src_uv] "+r"(src_uv), // %[src_uv]
558 [dst_rgb565] "+r"(dst_rgb565), // %[dst_rgb565]
559 [width] "+r"(width) // %[width]
560 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
561 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
562 [kNV12Table] "r"(&kNV12Table)
563 : "cc", "memory", YUVTORGB_REGS, "v2");
564 }
565
566 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
567 uint8_t* dst_argb,
568 const struct YuvConstants* yuvconstants,
569 int width) {
570 asm volatile(
571 YUVTORGB_SETUP
572 "movi v19.8b, #255 \n"
573 "ldr q2, [%[kNV12Table]] \n"
574 "1: \n" READYUY2 YUVTORGB RGBTORGB8
575 "subs %w[width], %w[width], #8 \n"
576 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
577 "b.gt 1b \n"
578 : [src_yuy2] "+r"(src_yuy2), // %[src_yuy2]
579 [dst_argb] "+r"(dst_argb), // %[dst_argb]
580 [width] "+r"(width) // %[width]
581 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
582 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
583 [kNV12Table] "r"(&kNV12Table)
584 : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
585 }
586
587 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
588 uint8_t* dst_argb,
589 const struct YuvConstants* yuvconstants,
590 int width) {
591 asm volatile(
592 YUVTORGB_SETUP
593 "movi v19.8b, #255 \n"
594 "ldr q2, [%[kNV12Table]] \n"
595 "1: \n" READUYVY YUVTORGB RGBTORGB8
596 "subs %w[width], %w[width], #8 \n"
597 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
598 "b.gt 1b \n"
599 : [src_uyvy] "+r"(src_uyvy), // %[src_uyvy]
600 [dst_argb] "+r"(dst_argb), // %[dst_argb]
601 [width] "+r"(width) // %[width]
602 : [kUVCoeff] "r"(&yuvconstants->kUVCoeff), // %[kUVCoeff]
603 [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias), // %[kRGBCoeffBias]
604 [kNV12Table] "r"(&kNV12Table)
605 : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
606 }
607
608 // Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
609 void SplitUVRow_NEON(const uint8_t* src_uv,
610 uint8_t* dst_u,
611 uint8_t* dst_v,
612 int width) {
613 asm volatile(
614 "1: \n"
615 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
616 "subs %w3, %w3, #16 \n" // 16 processed per loop
617 "prfm pldl1keep, [%0, 448] \n"
618 "st1 {v0.16b}, [%1], #16 \n" // store U
619 "st1 {v1.16b}, [%2], #16 \n" // store V
620 "b.gt 1b \n"
621 : "+r"(src_uv), // %0
622 "+r"(dst_u), // %1
623 "+r"(dst_v), // %2
624 "+r"(width) // %3 // Output registers
625 : // Input registers
626 : "cc", "memory", "v0", "v1" // Clobber List
627 );
628 }
629
630 // Reads 16 bytes of Y from a tile and writes out 16 bytes.
631 // MM21 Y tiles are 16x32, so src_tile_stride = 512 bytes.
632 // MM21 UV tiles are 8x16, so src_tile_stride = 256 bytes.
633 // width is measured in bytes, so 8 UV pairs = 16.
634 void DetileRow_NEON(const uint8_t* src,
635 ptrdiff_t src_tile_stride,
636 uint8_t* dst,
637 int width) {
638 asm volatile(
639 "1: \n"
640 "ld1 {v0.16b}, [%0], %3 \n" // load 16 bytes
641 "subs %w2, %w2, #16 \n" // 16 processed per loop
642 "prfm pldl1keep, [%0, 1792] \n" // 7 tiles of 256b ahead
643 "st1 {v0.16b}, [%1], #16 \n" // store 16 bytes
644 "b.gt 1b \n"
645 : "+r"(src), // %0
646 "+r"(dst), // %1
647 "+r"(width) // %2
648 : "r"(src_tile_stride) // %3
649 : "cc", "memory", "v0" // Clobber List
650 );
651 }
652
653 // Reads 16 16-bit Y values from a tile and writes out 16 Y values.
654 void DetileRow_16_NEON(const uint16_t* src,
655 ptrdiff_t src_tile_stride,
656 uint16_t* dst,
657 int width) {
658 asm volatile(
659 "1: \n"
660 "ld1 {v0.8h,v1.8h}, [%0], %3 \n" // load 16 pixels
661 "subs %w2, %w2, #16 \n" // 16 processed per loop
662 "prfm pldl1keep, [%0, 3584] \n" // 7 tiles of 512b ahead
663 "st1 {v0.8h,v1.8h}, [%1], #32 \n" // store 16 pixels
664 "b.gt 1b \n"
665 : "+r"(src), // %0
666 "+r"(dst), // %1
667 "+r"(width) // %2
668 : "r"(src_tile_stride * 2) // %3
669 : "cc", "memory", "v0", "v1" // Clobber List
670 );
671 }
672
673 // Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
674 void DetileSplitUVRow_NEON(const uint8_t* src_uv,
675 ptrdiff_t src_tile_stride,
676 uint8_t* dst_u,
677 uint8_t* dst_v,
678 int width) {
679 asm volatile(
680 "1: \n"
681 "ld2 {v0.8b,v1.8b}, [%0], %4 \n"
682 "subs %w3, %w3, #16 \n"
683 "prfm pldl1keep, [%0, 1792] \n"
684 "st1 {v0.8b}, [%1], #8 \n"
685 "st1 {v1.8b}, [%2], #8 \n"
686 "b.gt 1b \n"
687 : "+r"(src_uv), // %0
688 "+r"(dst_u), // %1
689 "+r"(dst_v), // %2
690 "+r"(width) // %3
691 : "r"(src_tile_stride) // %4
692 : "cc", "memory", "v0", "v1" // Clobber List
693 );
694 }
695
696 #if LIBYUV_USE_ST2
697 // Read 16 Y, 8 UV, and write 8 YUY2
698 void DetileToYUY2_NEON(const uint8_t* src_y,
699 ptrdiff_t src_y_tile_stride,
700 const uint8_t* src_uv,
701 ptrdiff_t src_uv_tile_stride,
702 uint8_t* dst_yuy2,
703 int width) {
704 asm volatile(
705 "1: \n"
706 "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
707 "prfm pldl1keep, [%0, 1792] \n"
708 "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
709 "prfm pldl1keep, [%1, 1792] \n"
710 "subs %w3, %w3, #16 \n" // store 8 YUY2
711 "st2 {v0.16b,v1.16b}, [%2], #32 \n"
712 "b.gt 1b \n"
713 : "+r"(src_y), // %0
714 "+r"(src_uv), // %1
715 "+r"(dst_yuy2), // %2
716 "+r"(width) // %3
717 : "r"(src_y_tile_stride), // %4
718 "r"(src_uv_tile_stride) // %5
719 : "cc", "memory", "v0", "v1" // Clobber list
720 );
721 }
722 #else
723 // Read 16 Y, 8 UV, and write 8 YUY2
724 void DetileToYUY2_NEON(const uint8_t* src_y,
725 ptrdiff_t src_y_tile_stride,
726 const uint8_t* src_uv,
727 ptrdiff_t src_uv_tile_stride,
728 uint8_t* dst_yuy2,
729 int width) {
730 asm volatile(
731 "1: \n"
732 "ld1 {v0.16b}, [%0], %4 \n" // load 16 Ys
733 "ld1 {v1.16b}, [%1], %5 \n" // load 8 UVs
734 "subs %w3, %w3, #16 \n"
735 "prfm pldl1keep, [%0, 1792] \n"
736 "zip1 v2.16b, v0.16b, v1.16b \n"
737 "prfm pldl1keep, [%1, 1792] \n"
738 "zip2 v3.16b, v0.16b, v1.16b \n"
739 "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 8 YUY2
740 "b.gt 1b \n"
741 : "+r"(src_y), // %0
742 "+r"(src_uv), // %1
743 "+r"(dst_yuy2), // %2
744 "+r"(width) // %3
745 : "r"(src_y_tile_stride), // %4
746 "r"(src_uv_tile_stride) // %5
747 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber list
748 );
749 }
750 #endif
751
752 // Unpack MT2T into tiled P010 64 pixels at a time. See
753 // tinyurl.com/mtk-10bit-video-format for format documentation.
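// In this implementation each 80 byte input block expands to 64 output
// samples: the first 16 bytes carry the 2 low bits of every sample (four
// samples per byte) and the next 64 bytes carry the 8 high bits. Each output
// lane is the 10 bit value left-aligned in 16 bits, with the top bits
// replicated into the low 6 bits by the sri #10 below.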
754 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
755 asm volatile(
756 "1: \n"
757 "ld1 {v7.16b}, [%0], #16 \n"
758 "ld1 {v0.16b-v3.16b}, [%0], #64 \n"
759 "shl v4.16b, v7.16b, #6 \n"
760 "shl v5.16b, v7.16b, #4 \n"
761 "shl v6.16b, v7.16b, #2 \n"
762 "subs %2, %2, #80 \n"
763 "zip1 v16.16b, v4.16b, v0.16b \n"
764 "zip1 v18.16b, v5.16b, v1.16b \n"
765 "zip1 v20.16b, v6.16b, v2.16b \n"
766 "zip1 v22.16b, v7.16b, v3.16b \n"
767 "zip2 v17.16b, v4.16b, v0.16b \n"
768 "zip2 v19.16b, v5.16b, v1.16b \n"
769 "zip2 v21.16b, v6.16b, v2.16b \n"
770 "zip2 v23.16b, v7.16b, v3.16b \n"
771 "sri v16.8h, v16.8h, #10 \n"
772 "sri v17.8h, v17.8h, #10 \n"
773 "sri v18.8h, v18.8h, #10 \n"
774 "sri v19.8h, v19.8h, #10 \n"
775 "st1 {v16.8h-v19.8h}, [%1], #64 \n"
776 "sri v20.8h, v20.8h, #10 \n"
777 "sri v21.8h, v21.8h, #10 \n"
778 "sri v22.8h, v22.8h, #10 \n"
779 "sri v23.8h, v23.8h, #10 \n"
780 "st1 {v20.8h-v23.8h}, [%1], #64 \n"
781 "b.gt 1b \n"
782 : "+r"(src), // %0
783 "+r"(dst), // %1
784 "+r"(size) // %2
785 :
786 : "cc", "memory", "w0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
787 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
788 }
789
790 #if LIBYUV_USE_ST2
791 // Reads 16 U's and V's and writes out 16 pairs of UV.
792 void MergeUVRow_NEON(const uint8_t* src_u,
793 const uint8_t* src_v,
794 uint8_t* dst_uv,
795 int width) {
796 asm volatile(
797 "1: \n"
798 "ld1 {v0.16b}, [%0], #16 \n" // load U
799 "ld1 {v1.16b}, [%1], #16 \n" // load V
800 "subs %w3, %w3, #16 \n" // 16 processed per loop
801 "prfm pldl1keep, [%0, 448] \n"
802 "prfm pldl1keep, [%1, 448] \n"
803 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
804 "b.gt 1b \n"
805 : "+r"(src_u), // %0
806 "+r"(src_v), // %1
807 "+r"(dst_uv), // %2
808 "+r"(width) // %3 // Output registers
809 : // Input registers
810 : "cc", "memory", "v0", "v1" // Clobber List
811 );
812 }
813
814 void MergeUVRow_16_NEON(const uint16_t* src_u,
815 const uint16_t* src_v,
816 uint16_t* dst_uv,
817 int depth,
818 int width) {
819 int shift = 16 - depth;
820 asm volatile(
821 "dup v2.8h, %w4 \n"
822 "1: \n"
823 "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
824 "subs %w3, %w3, #8 \n" // 8 src pixels per loop
825 "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
826 "ushl v0.8h, v0.8h, v2.8h \n"
827 "prfm pldl1keep, [%0, 448] \n"
828 "ushl v1.8h, v1.8h, v2.8h \n"
829 "prfm pldl1keep, [%1, 448] \n"
830 "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels
831 "b.gt 1b \n"
832 : "+r"(src_u), // %0
833 "+r"(src_v), // %1
834 "+r"(dst_uv), // %2
835 "+r"(width) // %3
836 : "r"(shift) // %4
837 : "cc", "memory", "v0", "v1", "v2");
838 }
839 #else
840 // Reads 16 U's and V's and writes out 16 pairs of UV.
841 void MergeUVRow_NEON(const uint8_t* src_u,
842 const uint8_t* src_v,
843 uint8_t* dst_uv,
844 int width) {
845 asm volatile(
846 "1: \n"
847 "ld1 {v0.16b}, [%0], #16 \n" // load U
848 "ld1 {v1.16b}, [%1], #16 \n" // load V
849 "subs %w3, %w3, #16 \n" // 16 processed per loop
850 "zip1 v2.16b, v0.16b, v1.16b \n"
851 "prfm pldl1keep, [%0, 448] \n"
852 "zip2 v3.16b, v0.16b, v1.16b \n"
853 "prfm pldl1keep, [%1, 448] \n"
854 "st1 {v2.16b,v3.16b}, [%2], #32 \n" // store 16 pairs of UV
855 "b.gt 1b \n"
856 : "+r"(src_u), // %0
857 "+r"(src_v), // %1
858 "+r"(dst_uv), // %2
859 "+r"(width) // %3 // Output registers
860 : // Input registers
861 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
862 );
863 }
864
865 void MergeUVRow_16_NEON(const uint16_t* src_u,
866 const uint16_t* src_v,
867 uint16_t* dst_uv,
868 int depth,
869 int width) {
870 int shift = 16 - depth;
871 asm volatile(
872 "dup v4.8h, %w4 \n"
873 "1: \n"
874 "ld1 {v0.8h}, [%0], #16 \n" // load 8 U
875 "subs %w3, %w3, #8 \n" // 8 src pixels per loop
876 "ld1 {v1.8h}, [%1], #16 \n" // load 8 V
877 "ushl v0.8h, v0.8h, v4.8h \n"
878 "ushl v1.8h, v1.8h, v4.8h \n"
879 "prfm pldl1keep, [%0, 448] \n"
880 "zip1 v2.8h, v0.8h, v1.8h \n"
881 "zip2 v3.8h, v0.8h, v1.8h \n"
882 "prfm pldl1keep, [%1, 448] \n"
883 "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store 8 UV pixels
884 "b.gt 1b \n"
885 : "+r"(src_u), // %0
886 "+r"(src_v), // %1
887 "+r"(dst_uv), // %2
888 "+r"(width) // %3
889 : "r"(shift) // %4
890 : "cc", "memory", "v0", "v1", "v2", "v1", "v2", "v3", "v4");
891 }
892 #endif // LIBYUV_USE_ST2
893
894 // Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
895 void SplitRGBRow_NEON(const uint8_t* src_rgb,
896 uint8_t* dst_r,
897 uint8_t* dst_g,
898 uint8_t* dst_b,
899 int width) {
900 asm volatile(
901 "1: \n"
902 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
903 "subs %w4, %w4, #16 \n" // 16 processed per loop
904 "prfm pldl1keep, [%0, 448] \n"
905 "st1 {v0.16b}, [%1], #16 \n" // store R
906 "st1 {v1.16b}, [%2], #16 \n" // store G
907 "st1 {v2.16b}, [%3], #16 \n" // store B
908 "b.gt 1b \n"
909 : "+r"(src_rgb), // %0
910 "+r"(dst_r), // %1
911 "+r"(dst_g), // %2
912 "+r"(dst_b), // %3
913 "+r"(width) // %4
914 : // Input registers
915 : "cc", "memory", "v0", "v1", "v2" // Clobber List
916 );
917 }
918
919 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
920 void MergeRGBRow_NEON(const uint8_t* src_r,
921 const uint8_t* src_g,
922 const uint8_t* src_b,
923 uint8_t* dst_rgb,
924 int width) {
925 asm volatile(
926 "1: \n"
927 "ld1 {v0.16b}, [%0], #16 \n" // load R
928 "ld1 {v1.16b}, [%1], #16 \n" // load G
929 "ld1 {v2.16b}, [%2], #16 \n" // load B
930 "subs %w4, %w4, #16 \n" // 16 processed per loop
931 "prfm pldl1keep, [%0, 448] \n"
932 "prfm pldl1keep, [%1, 448] \n"
933 "prfm pldl1keep, [%2, 448] \n"
934 "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
935 "b.gt 1b \n"
936 : "+r"(src_r), // %0
937 "+r"(src_g), // %1
938 "+r"(src_b), // %2
939 "+r"(dst_rgb), // %3
940 "+r"(width) // %4
941 : // Input registers
942 : "cc", "memory", "v0", "v1", "v2" // Clobber List
943 );
944 }
945
946 // Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b, dst_a.
947 void SplitARGBRow_NEON(const uint8_t* src_rgba,
948 uint8_t* dst_r,
949 uint8_t* dst_g,
950 uint8_t* dst_b,
951 uint8_t* dst_a,
952 int width) {
953 asm volatile(
954 "1: \n"
955 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
956 "subs %w5, %w5, #16 \n" // 16 processed per loop
957 "prfm pldl1keep, [%0, 448] \n"
958 "st1 {v0.16b}, [%3], #16 \n" // store B
959 "st1 {v1.16b}, [%2], #16 \n" // store G
960 "st1 {v2.16b}, [%1], #16 \n" // store R
961 "st1 {v3.16b}, [%4], #16 \n" // store A
962 "b.gt 1b \n"
963 : "+r"(src_rgba), // %0
964 "+r"(dst_r), // %1
965 "+r"(dst_g), // %2
966 "+r"(dst_b), // %3
967 "+r"(dst_a), // %4
968 "+r"(width) // %5
969 : // Input registers
970 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
971 );
972 }
973
974 #if LIBYUV_USE_ST4
975 // Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
976 void MergeARGBRow_NEON(const uint8_t* src_r,
977 const uint8_t* src_g,
978 const uint8_t* src_b,
979 const uint8_t* src_a,
980 uint8_t* dst_argb,
981 int width) {
982 asm volatile(
983 "1: \n"
984 "ld1 {v0.16b}, [%2], #16 \n" // load B
985 "ld1 {v1.16b}, [%1], #16 \n" // load G
986 "ld1 {v2.16b}, [%0], #16 \n" // load R
987 "ld1 {v3.16b}, [%3], #16 \n" // load A
988 "subs %w5, %w5, #16 \n" // 16 processed per loop
989 "prfm pldl1keep, [%0, 448] \n"
990 "prfm pldl1keep, [%1, 448] \n"
991 "prfm pldl1keep, [%2, 448] \n"
992 "prfm pldl1keep, [%3, 448] \n"
993 "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
994 "b.gt 1b \n"
995 : "+r"(src_r), // %0
996 "+r"(src_g), // %1
997 "+r"(src_b), // %2
998 "+r"(src_a), // %3
999 "+r"(dst_argb), // %4
1000 "+r"(width) // %5
1001 : // Input registers
1002 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1003 );
1004 }
1005 #else
1006 // Reads 16 planar R's, G's, B's and A's and writes out 16 packed ARGB at a time
1007 void MergeARGBRow_NEON(const uint8_t* src_r,
1008 const uint8_t* src_g,
1009 const uint8_t* src_b,
1010 const uint8_t* src_a,
1011 uint8_t* dst_argb,
1012 int width) {
1013 asm volatile(
1014 "1: \n"
1015 "ld1 {v0.16b}, [%2], #16 \n" // load B
1016 "ld1 {v1.16b}, [%1], #16 \n" // load G
1017 "ld1 {v2.16b}, [%0], #16 \n" // load R
1018 "ld1 {v3.16b}, [%3], #16 \n" // load A
1019 "subs %w5, %w5, #16 \n" // 16 processed per loop
1020 "prfm pldl1keep, [%2, 448] \n"
1021 "zip1 v4.16b, v0.16b, v1.16b \n" // BG
1022 "zip1 v5.16b, v2.16b, v3.16b \n" // RA
1023 "prfm pldl1keep, [%1, 448] \n"
1024 "zip2 v6.16b, v0.16b, v1.16b \n" // BG
1025 "zip2 v7.16b, v2.16b, v3.16b \n" // RA
1026 "prfm pldl1keep, [%0, 448] \n"
1027 "zip1 v0.8h, v4.8h, v5.8h \n" // BGRA
1028 "zip2 v1.8h, v4.8h, v5.8h \n"
1029 "prfm pldl1keep, [%3, 448] \n"
1030 "zip1 v2.8h, v6.8h, v7.8h \n"
1031 "zip2 v3.8h, v6.8h, v7.8h \n"
1032 "st1 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB
1033 "b.gt 1b \n"
1034 : "+r"(src_r), // %0
1035 "+r"(src_g), // %1
1036 "+r"(src_b), // %2
1037 "+r"(src_a), // %3
1038 "+r"(dst_argb), // %4
1039 "+r"(width) // %5
1040 : // Input registers
1041 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1042 "v7" // Clobber List
1043 );
1044 }
1045 #endif // LIBYUV_USE_ST4
1046
1047 // Reads 16 packed ARGB and writes to planar dst_r, dst_g, dst_b.
1048 void SplitXRGBRow_NEON(const uint8_t* src_rgba,
1049 uint8_t* dst_r,
1050 uint8_t* dst_g,
1051 uint8_t* dst_b,
1052 int width) {
1053 asm volatile(
1054 "1: \n"
1055 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
1056 "subs %w4, %w4, #16 \n" // 16 processed per loop
1057 "prfm pldl1keep, [%0, 448] \n"
1058 "st1 {v0.16b}, [%3], #16 \n" // store B
1059 "st1 {v1.16b}, [%2], #16 \n" // store G
1060 "st1 {v2.16b}, [%1], #16 \n" // store R
1061 "b.gt 1b \n"
1062 : "+r"(src_rgba), // %0
1063 "+r"(dst_r), // %1
1064 "+r"(dst_g), // %2
1065 "+r"(dst_b), // %3
1066 "+r"(width) // %4
1067 : // Input registers
1068 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1069 );
1070 }
1071
1072 // Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time
1073 void MergeXRGBRow_NEON(const uint8_t* src_r,
1074 const uint8_t* src_g,
1075 const uint8_t* src_b,
1076 uint8_t* dst_argb,
1077 int width) {
1078 asm volatile(
1079 "movi v3.16b, #255 \n" // load A(255)
1080 "1: \n"
1081 "ld1 {v2.16b}, [%0], #16 \n" // load R
1082 "ld1 {v1.16b}, [%1], #16 \n" // load G
1083 "ld1 {v0.16b}, [%2], #16 \n" // load B
1084 "subs %w4, %w4, #16 \n" // 16 processed per loop
1085 "prfm pldl1keep, [%0, 448] \n"
1086 "prfm pldl1keep, [%1, 448] \n"
1087 "prfm pldl1keep, [%2, 448] \n"
1088 "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB
1089 "b.gt 1b \n"
1090 : "+r"(src_r), // %0
1091 "+r"(src_g), // %1
1092 "+r"(src_b), // %2
1093 "+r"(dst_argb), // %3
1094 "+r"(width) // %4
1095 : // Input registers
1096 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1097 );
1098 }
1099
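// The two MergeXR30 kernels below produce 32 bit AR30 words laid out as
//   ar30 = (3u << 30) | (r10 << 20) | (g10 << 10) | b10;
// with each channel clamped to 10 bits (umin against 1023) and alpha forced
// to opaque by the final orr.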
1100 void MergeXR30Row_NEON(const uint16_t* src_r,
1101 const uint16_t* src_g,
1102 const uint16_t* src_b,
1103 uint8_t* dst_ar30,
1104 int depth,
1105 int width) {
1106 int shift = 10 - depth;
1107 asm volatile(
1108 "movi v30.16b, #255 \n"
1109 "ushr v30.4s, v30.4s, #22 \n" // 1023
1110 "dup v31.4s, %w5 \n"
1111 "1: \n"
1112 "ldr d2, [%2], #8 \n" // B
1113 "ldr d1, [%1], #8 \n" // G
1114 "ldr d0, [%0], #8 \n" // R
1115 "ushll v2.4s, v2.4h, #0 \n" // B
1116 "ushll v1.4s, v1.4h, #0 \n" // G
1117 "ushll v0.4s, v0.4h, #0 \n" // R
1118 "ushl v2.4s, v2.4s, v31.4s \n" // 000B
1119 "ushl v1.4s, v1.4s, v31.4s \n" // G
1120 "ushl v0.4s, v0.4s, v31.4s \n" // R
1121 "umin v2.4s, v2.4s, v30.4s \n"
1122 "umin v1.4s, v1.4s, v30.4s \n"
1123 "umin v0.4s, v0.4s, v30.4s \n"
1124 "sli v2.4s, v1.4s, #10 \n" // 00GB
1125 "sli v2.4s, v0.4s, #20 \n" // 0RGB
1126 "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
1127 "subs %w4, %w4, #4 \n"
1128 "str q2, [%3], #16 \n"
1129 "b.gt 1b \n"
1130 : "+r"(src_r), // %0
1131 "+r"(src_g), // %1
1132 "+r"(src_b), // %2
1133 "+r"(dst_ar30), // %3
1134 "+r"(width) // %4
1135 : "r"(shift) // %5
1136 : "memory", "cc", "v0", "v1", "v2", "v30", "v31");
1137 }
1138
1139 void MergeXR30Row_10_NEON(const uint16_t* src_r,
1140 const uint16_t* src_g,
1141 const uint16_t* src_b,
1142 uint8_t* dst_ar30,
1143 int /* depth */,
1144 int width) {
1145 asm volatile(
1146 "movi v30.16b, #255 \n"
1147 "ushr v30.4s, v30.4s, #22 \n" // 1023
1148 "1: \n"
1149 "ldr d2, [%2], #8 \n" // B
1150 "ldr d1, [%1], #8 \n" // G
1151 "ldr d0, [%0], #8 \n" // R
1152 "ushll v2.4s, v2.4h, #0 \n" // 000B
1153 "ushll v1.4s, v1.4h, #0 \n" // G
1154 "ushll v0.4s, v0.4h, #0 \n" // R
1155 "umin v2.4s, v2.4s, v30.4s \n"
1156 "umin v1.4s, v1.4s, v30.4s \n"
1157 "umin v0.4s, v0.4s, v30.4s \n"
1158 "sli v2.4s, v1.4s, #10 \n" // 00GB
1159 "sli v2.4s, v0.4s, #20 \n" // 0RGB
1160 "orr v2.4s, #0xc0, lsl #24 \n" // ARGB (AR30)
1161 "subs %w4, %w4, #4 \n"
1162 "str q2, [%3], #16 \n"
1163 "b.gt 1b \n"
1164 : "+r"(src_r), // %0
1165 "+r"(src_g), // %1
1166 "+r"(src_b), // %2
1167 "+r"(dst_ar30), // %3
1168 "+r"(width) // %4
1169 :
1170 : "memory", "cc", "v0", "v1", "v2", "v30");
1171 }
1172
1173 void MergeAR64Row_NEON(const uint16_t* src_r,
1174 const uint16_t* src_g,
1175 const uint16_t* src_b,
1176 const uint16_t* src_a,
1177 uint16_t* dst_ar64,
1178 int depth,
1179 int width) {
1180 int shift = 16 - depth;
1181 int mask = (1 << depth) - 1;
1182 asm volatile(
1183
1184 "dup v30.8h, %w7 \n"
1185 "dup v31.8h, %w6 \n"
1186 "1: \n"
1187 "ldr q2, [%0], #16 \n" // R
1188 "ldr q1, [%1], #16 \n" // G
1189 "ldr q0, [%2], #16 \n" // B
1190 "ldr q3, [%3], #16 \n" // A
1191 "umin v2.8h, v2.8h, v30.8h \n"
1192 "prfm pldl1keep, [%0, 448] \n"
1193 "umin v1.8h, v1.8h, v30.8h \n"
1194 "prfm pldl1keep, [%1, 448] \n"
1195 "umin v0.8h, v0.8h, v30.8h \n"
1196 "prfm pldl1keep, [%2, 448] \n"
1197 "umin v3.8h, v3.8h, v30.8h \n"
1198 "prfm pldl1keep, [%3, 448] \n"
1199 "ushl v2.8h, v2.8h, v31.8h \n"
1200 "ushl v1.8h, v1.8h, v31.8h \n"
1201 "ushl v0.8h, v0.8h, v31.8h \n"
1202 "ushl v3.8h, v3.8h, v31.8h \n"
1203 "subs %w5, %w5, #8 \n"
1204 "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%4], #64 \n"
1205 "b.gt 1b \n"
1206 : "+r"(src_r), // %0
1207 "+r"(src_g), // %1
1208 "+r"(src_b), // %2
1209 "+r"(src_a), // %3
1210 "+r"(dst_ar64), // %4
1211 "+r"(width) // %5
1212 : "r"(shift), // %6
1213 "r"(mask) // %7
1214 : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1215 }
1216
1217 void MergeXR64Row_NEON(const uint16_t* src_r,
1218 const uint16_t* src_g,
1219 const uint16_t* src_b,
1220 uint16_t* dst_ar64,
1221 int depth,
1222 int width) {
1223 int shift = 16 - depth;
1224 int mask = (1 << depth) - 1;
1225 asm volatile(
1226
1227 "movi v3.16b, #0xff \n" // A (0xffff)
1228 "dup v30.8h, %w6 \n"
1229 "dup v31.8h, %w5 \n"
1230
1231 "1: \n"
1232 "ldr q2, [%0], #16 \n" // R
1233 "ldr q1, [%1], #16 \n" // G
1234 "ldr q0, [%2], #16 \n" // B
1235 "umin v2.8h, v2.8h, v30.8h \n"
1236 "prfm pldl1keep, [%0, 448] \n"
1237 "umin v1.8h, v1.8h, v30.8h \n"
1238 "prfm pldl1keep, [%1, 448] \n"
1239 "umin v0.8h, v0.8h, v30.8h \n"
1240 "prfm pldl1keep, [%2, 448] \n"
1241 "ushl v2.8h, v2.8h, v31.8h \n"
1242 "ushl v1.8h, v1.8h, v31.8h \n"
1243 "ushl v0.8h, v0.8h, v31.8h \n"
1244 "subs %w4, %w4, #8 \n"
1245 "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%3], #64 \n"
1246 "b.gt 1b \n"
1247 : "+r"(src_r), // %0
1248 "+r"(src_g), // %1
1249 "+r"(src_b), // %2
1250 "+r"(dst_ar64), // %3
1251 "+r"(width) // %4
1252 : "r"(shift), // %5
1253 "r"(mask) // %6
1254 : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1255 }
1256
1257 void MergeARGB16To8Row_NEON(const uint16_t* src_r,
1258 const uint16_t* src_g,
1259 const uint16_t* src_b,
1260 const uint16_t* src_a,
1261 uint8_t* dst_argb,
1262 int depth,
1263 int width) {
1264 int shift = 8 - depth;
1265 asm volatile(
1266
1267 "dup v31.8h, %w6 \n"
1268 "1: \n"
1269 "ldr q2, [%0], #16 \n" // R
1270 "ldr q1, [%1], #16 \n" // G
1271 "ldr q0, [%2], #16 \n" // B
1272 "ldr q3, [%3], #16 \n" // A
1273 "ushl v2.8h, v2.8h, v31.8h \n"
1274 "prfm pldl1keep, [%0, 448] \n"
1275 "ushl v1.8h, v1.8h, v31.8h \n"
1276 "prfm pldl1keep, [%1, 448] \n"
1277 "ushl v0.8h, v0.8h, v31.8h \n"
1278 "prfm pldl1keep, [%2, 448] \n"
1279 "ushl v3.8h, v3.8h, v31.8h \n"
1280 "prfm pldl1keep, [%3, 448] \n"
1281 "uqxtn v2.8b, v2.8h \n"
1282 "uqxtn v1.8b, v1.8h \n"
1283 "uqxtn v0.8b, v0.8h \n"
1284 "uqxtn v3.8b, v3.8h \n"
1285 "subs %w5, %w5, #8 \n"
1286 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%4], #32 \n"
1287 "b.gt 1b \n"
1288 : "+r"(src_r), // %0
1289 "+r"(src_g), // %1
1290 "+r"(src_b), // %2
1291 "+r"(src_a), // %3
1292 "+r"(dst_argb), // %4
1293 "+r"(width) // %5
1294 : "r"(shift) // %6
1295 : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1296 }
1297
1298 void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
1299 const uint16_t* src_g,
1300 const uint16_t* src_b,
1301 uint8_t* dst_argb,
1302 int depth,
1303 int width) {
1304 int shift = 8 - depth;
1305 asm volatile(
1306
1307 "dup v31.8h, %w5 \n"
1308 "movi v3.8b, #0xff \n" // A (0xff)
1309 "1: \n"
1310 "ldr q2, [%0], #16 \n" // R
1311 "ldr q1, [%1], #16 \n" // G
1312 "ldr q0, [%2], #16 \n" // B
1313 "ushl v2.8h, v2.8h, v31.8h \n"
1314 "prfm pldl1keep, [%0, 448] \n"
1315 "ushl v1.8h, v1.8h, v31.8h \n"
1316 "prfm pldl1keep, [%1, 448] \n"
1317 "ushl v0.8h, v0.8h, v31.8h \n"
1318 "prfm pldl1keep, [%2, 448] \n"
1319 "uqxtn v2.8b, v2.8h \n"
1320 "uqxtn v1.8b, v1.8h \n"
1321 "uqxtn v0.8b, v0.8h \n"
1322 "subs %w4, %w4, #8 \n"
1323 "st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [%3], #32 \n"
1324 "b.gt 1b \n"
1325 : "+r"(src_r), // %0
1326 "+r"(src_g), // %1
1327 "+r"(src_b), // %2
1328 "+r"(dst_argb), // %3
1329 "+r"(width) // %4
1330 : "r"(shift) // %5
1331 : "memory", "cc", "v0", "v1", "v2", "v3", "v31");
1332 }
1333
1334 // Copy multiple of 32.
1335 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
1336 asm volatile(
1337 "1: \n"
1338 "ldp q0, q1, [%0], #32 \n"
1339 "prfm pldl1keep, [%0, 448] \n"
1340 "subs %w2, %w2, #32 \n" // 32 processed per loop
1341 "stp q0, q1, [%1], #32 \n"
1342 "b.gt 1b \n"
1343 : "+r"(src), // %0
1344 "+r"(dst), // %1
1345 "+r"(width) // %2 // Output registers
1346 : // Input registers
1347 : "cc", "memory", "v0", "v1" // Clobber List
1348 );
1349 }
1350
1351 // SetRow writes 'width' bytes using an 8 bit value repeated.
1352 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
1353 asm volatile(
1354 "dup v0.16b, %w2 \n" // duplicate 16 bytes
1355 "1: \n"
1356 "subs %w1, %w1, #16 \n" // 16 bytes per loop
1357 "st1 {v0.16b}, [%0], #16 \n" // store
1358 "b.gt 1b \n"
1359 : "+r"(dst), // %0
1360 "+r"(width) // %1
1361 : "r"(v8) // %2
1362 : "cc", "memory", "v0");
1363 }
1364
1365 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
1366 asm volatile(
1367 "dup v0.4s, %w2 \n" // duplicate 4 ints
1368 "1: \n"
1369 "subs %w1, %w1, #4 \n" // 4 ints per loop
1370 "st1 {v0.16b}, [%0], #16 \n" // store
1371 "b.gt 1b \n"
1372 : "+r"(dst), // %0
1373 "+r"(width) // %1
1374 : "r"(v32) // %2
1375 : "cc", "memory", "v0");
1376 }
1377
1378 // Shuffle table for reversing the bytes.
1379 static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
1380 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
1381
1382 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
1383 asm volatile(
1384 // Start at end of source row.
1385 "ld1 {v3.16b}, [%3] \n" // shuffler
1386 "add %0, %0, %w2, sxtw \n"
1387 "sub %0, %0, #32 \n"
1388 "1: \n"
1389 "ldr q2, [%0, 16] \n"
1390 "ldr q1, [%0], -32 \n" // src -= 32
1391 "subs %w2, %w2, #32 \n" // 32 pixels per loop.
1392 "tbl v0.16b, {v2.16b}, v3.16b \n"
1393 "tbl v1.16b, {v1.16b}, v3.16b \n"
1394 "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
1395 "b.gt 1b \n"
1396 : "+r"(src), // %0
1397 "+r"(dst), // %1
1398 "+r"(width) // %2
1399 : "r"(&kShuffleMirror) // %3
1400 : "cc", "memory", "v0", "v1", "v2", "v3");
1401 }
1402
1403 // Shuffle table for reversing the UV.
1404 static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
1405 6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
1406
1407 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
1408 asm volatile(
1409 // Start at end of source row.
1410 "ld1 {v4.16b}, [%3] \n" // shuffler
1411 "add %0, %0, %w2, sxtw #1 \n"
1412 "sub %0, %0, #32 \n"
1413 "1: \n"
1414 "ldr q1, [%0, 16] \n"
1415 "ldr q0, [%0], -32 \n" // src -= 32
1416 "subs %w2, %w2, #16 \n" // 16 pixels per loop.
1417 "tbl v2.16b, {v1.16b}, v4.16b \n"
1418 "tbl v3.16b, {v0.16b}, v4.16b \n"
1419 "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
1420 "b.gt 1b \n"
1421 : "+r"(src_uv), // %0
1422 "+r"(dst_uv), // %1
1423 "+r"(width) // %2
1424 : "r"(&kShuffleMirrorUV) // %3
1425 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
1426 }
1427
1428 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
1429 uint8_t* dst_u,
1430 uint8_t* dst_v,
1431 int width) {
1432 asm volatile(
1433 // Start at end of source row.
1434 "ld1 {v4.16b}, [%4] \n" // shuffler
1435 "add %0, %0, %w3, sxtw #1 \n"
1436 "sub %0, %0, #32 \n"
1437 "1: \n"
1438 "ldr q1, [%0, 16] \n"
1439 "ldr q0, [%0], -32 \n" // src -= 32
1440 "subs %w3, %w3, #16 \n" // 16 pixels per loop.
1441 "tbl v2.16b, {v1.16b}, v4.16b \n"
1442 "tbl v3.16b, {v0.16b}, v4.16b \n"
1443 "uzp1 v0.16b, v2.16b, v3.16b \n" // U
1444 "uzp2 v1.16b, v2.16b, v3.16b \n" // V
1445 "st1 {v0.16b}, [%1], #16 \n" // dst += 16
1446 "st1 {v1.16b}, [%2], #16 \n"
1447 "b.gt 1b \n"
1448 : "+r"(src_uv), // %0
1449 "+r"(dst_u), // %1
1450 "+r"(dst_v), // %2
1451 "+r"(width) // %3
1452 : "r"(&kShuffleMirrorUV) // %4
1453 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
1454 }
1455
1456 // Shuffle table for reversing the ARGB.
1457 static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
1458 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
1459
1460 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
1461 asm volatile(
1462 // Start at end of source row.
1463 "ld1 {v4.16b}, [%3] \n" // shuffler
1464 "add %0, %0, %w2, sxtw #2 \n"
1465 "sub %0, %0, #32 \n"
1466 "1: \n"
1467 "ldr q1, [%0, 16] \n"
1468 "ldr q0, [%0], -32 \n" // src -= 32
1469 "subs %w2, %w2, #8 \n" // 8 pixels per loop.
1470 "tbl v2.16b, {v1.16b}, v4.16b \n"
1471 "tbl v3.16b, {v0.16b}, v4.16b \n"
1472 "st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
1473 "b.gt 1b \n"
1474 : "+r"(src_argb), // %0
1475 "+r"(dst_argb), // %1
1476 "+r"(width) // %2
1477 : "r"(&kShuffleMirrorARGB) // %3
1478 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
1479 }
1480
1481 void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
1482 uint8_t* dst_rgb24,
1483 int width) {
1484 asm volatile(
1485 "ld1 {v3.16b}, [%4] \n" // shuffler
1486 "add %0, %0, %w2, sxtw #1 \n" // Start at end of row.
1487 "add %0, %0, %w2, sxtw \n"
1488 "sub %0, %0, #48 \n"
1489
1490 "1: \n"
1491 "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3 \n" // src -= 48
1492 "subs %w2, %w2, #16 \n" // 16 pixels per loop.
1493 "tbl v0.16b, {v0.16b}, v3.16b \n"
1494 "tbl v1.16b, {v1.16b}, v3.16b \n"
1495 "tbl v2.16b, {v2.16b}, v3.16b \n"
1496 "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48
1497 "b.gt 1b \n"
1498 : "+r"(src_rgb24), // %0
1499 "+r"(dst_rgb24), // %1
1500 "+r"(width) // %2
1501 : "r"((ptrdiff_t)-48), // %3
1502 "r"(&kShuffleMirror) // %4
1503 : "cc", "memory", "v0", "v1", "v2", "v3");
1504 }
1505
1506 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
1507 uint8_t* dst_argb,
1508 int width) {
1509 asm volatile(
1510 "movi v4.8b, #255 \n" // Alpha
1511 "1: \n"
1512 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of
1513 // RGB24.
1514 "prfm pldl1keep, [%0, 448] \n"
1515 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1516 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
1517 "b.gt 1b \n"
1518 : "+r"(src_rgb24), // %0
1519 "+r"(dst_argb), // %1
1520 "+r"(width) // %2
1521 :
1522 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
1523 );
1524 }
1525
1526 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
1527 asm volatile(
1528 "movi v5.8b, #255 \n" // Alpha
1529 "1: \n"
1530 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
1531 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1532 "orr v3.8b, v1.8b, v1.8b \n" // move g
1533 "prfm pldl1keep, [%0, 448] \n"
1534 "orr v4.8b, v0.8b, v0.8b \n" // move r
1535 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
1536 "b.gt 1b \n"
1537 : "+r"(src_raw), // %0
1538 "+r"(dst_argb), // %1
1539 "+r"(width) // %2
1540 :
1541 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
1542 );
1543 }
1544
1545 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
1546 asm volatile(
1547 "movi v0.8b, #255 \n" // Alpha
1548 "1: \n"
1549 "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b
1550 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1551 "orr v2.8b, v4.8b, v4.8b \n" // move g
1552 "prfm pldl1keep, [%0, 448] \n"
1553 "orr v1.8b, v5.8b, v5.8b \n" // move r
1554 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
1555 "b.gt 1b \n"
1556 : "+r"(src_raw), // %0
1557 "+r"(dst_rgba), // %1
1558 "+r"(width) // %2
1559 :
1560 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
1561 );
1562 }
1563
1564 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
1565 asm volatile(
1566 "1: \n"
1567 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
1568 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1569 "orr v3.8b, v1.8b, v1.8b \n" // move g
1570 "prfm pldl1keep, [%0, 448] \n"
1571 "orr v4.8b, v0.8b, v0.8b \n" // move r
1572 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
1573 "b.gt 1b \n"
1574 : "+r"(src_raw), // %0
1575 "+r"(dst_rgb24), // %1
1576 "+r"(width) // %2
1577 :
1578 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1579 );
1580 }
1581
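// Convert 8 RGB565 pixels in v0.8h to b = v0.8b, g = v1.8b, r = v2.8b.
// Each channel is widened to 8 bits by replicating its top bits, e.g.
// B8 = (B5 << 3) | (B5 >> 2) and G8 = (G6 << 2) | (G6 >> 4).
// Clobbers v4 and v6.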
1582 #define RGB565TOARGB \
1583 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
1584 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
1585 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
1586 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
1587 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1588 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
1589 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
1590 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
1591 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
1592 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
1593 "dup v2.2D, v0.D[1] \n" /* R */
1594
1595 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
1596 uint8_t* dst_argb,
1597 int width) {
1598 asm volatile(
1599 "movi v3.8b, #255 \n" // Alpha
1600 "1: \n"
1601 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1602 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1603 "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB
1604 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
1605 "b.gt 1b \n"
1606 : "+r"(src_rgb565), // %0
1607 "+r"(dst_argb), // %1
1608 "+r"(width) // %2
1609 :
1610 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
1611 );
1612 }
1613
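// Convert 8 ARGB1555 pixels in v0.8h to b = v0.8b, g = v1.8b, r = v2.8b,
// a = v3.8b. The 5-bit channels are widened by bit replication; the 1-bit
// alpha is sign-extended to 0x00 or 0xff.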
1614 #define ARGB1555TOARGB \
1615 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
1616 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
1617 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
1618 \
1619 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
1620 "xtn2 v3.16b, v2.8h \n" \
1621 \
1622 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1623 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
1624 \
1625 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
1626 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
1627 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
1628 \
1629 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
1630 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
1631 "dup v1.2D, v0.D[1] \n" \
1632 "dup v3.2D, v2.D[1] \n"
1633
1634 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
1635 #define RGB555TOARGB \
1636 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
1637 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
1638 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
1639 \
1640 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
1641 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
1642 \
1643 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
1644 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
1645 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
1646 \
1647 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
1648 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
1649 "dup v1.2D, v0.D[1] \n" /* G */
1650
1651 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
1652 uint8_t* dst_argb,
1653 int width) {
1654 asm volatile(
1655 "movi v3.8b, #255 \n" // Alpha
1656 "1: \n"
1657 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1658 "prfm pldl1keep, [%0, 448] \n"
1659 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1660 ARGB1555TOARGB
1661 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
1662 "b.gt 1b \n"
1663 : "+r"(src_argb1555), // %0
1664 "+r"(dst_argb), // %1
1665 "+r"(width) // %2
1666 :
1667 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1668 );
1669 }
1670
1671 // Convert v0.8h to b = v0.8b g = v1.8b r = v2.8b
1672 // clobbers v3
1673 #define ARGB4444TOARGB \
1674 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
1675 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
1676 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
1677 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
1678 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
1679 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
1680 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
1681 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
1682 "dup v0.2D, v2.D[1] \n" \
1683 "dup v1.2D, v3.D[1] \n"
1684
1685 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
1686 uint8_t* dst_argb,
1687 int width) {
1688 asm volatile(
1689 "1: \n"
1690 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1691 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1692 "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB
1693 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
1694 "b.gt 1b \n"
1695 : "+r"(src_argb4444), // %0
1696 "+r"(dst_argb), // %1
1697 "+r"(width) // %2
1698 :
1699 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1700 );
1701 }
1702
1703 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
1704 uint8_t* dst_rgb24,
1705 int width) {
1706 asm volatile(
1707 "1: \n"
1708 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB
1709 "subs %w2, %w2, #16 \n" // 16 pixels per loop.
1710 "prfm pldl1keep, [%0, 448] \n"
1711       "st3         {v0.16b,v1.16b,v2.16b}, [%1], #48 \n"  // store 16 RGB24
1712 "b.gt 1b \n"
1713 : "+r"(src_argb), // %0
1714 "+r"(dst_rgb24), // %1
1715 "+r"(width) // %2
1716 :
1717 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1718 );
1719 }
1720
1721 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
1722 asm volatile(
1723 "1: \n"
1724 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
1725 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1726 "orr v4.8b, v2.8b, v2.8b \n" // mov g
1727 "prfm pldl1keep, [%0, 448] \n"
1728 "orr v5.8b, v1.8b, v1.8b \n" // mov b
1729 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
1730 "b.gt 1b \n"
1731 : "+r"(src_argb), // %0
1732 "+r"(dst_raw), // %1
1733 "+r"(width) // %2
1734 :
1735 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
1736 );
1737 }
1738
1739 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
1740 asm volatile(
1741 "1: \n"
1742 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1743 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1744 "prfm pldl1keep, [%0, 448] \n"
1745 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1746 "b.gt 1b \n"
1747 : "+r"(src_yuy2), // %0
1748 "+r"(dst_y), // %1
1749 "+r"(width) // %2
1750 :
1751 : "cc", "memory", "v0", "v1" // Clobber List
1752 );
1753 }
1754
1755 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
1756 asm volatile(
1757 "1: \n"
1758 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1759 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1760 "prfm pldl1keep, [%0, 448] \n"
1761 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1762 "b.gt 1b \n"
1763 : "+r"(src_uyvy), // %0
1764 "+r"(dst_y), // %1
1765 "+r"(width) // %2
1766 :
1767 : "cc", "memory", "v0", "v1" // Clobber List
1768 );
1769 }
1770
1771 void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
1772 uint8_t* dst_u,
1773 uint8_t* dst_v,
1774 int width) {
1775 asm volatile(
1776 "1: \n"
1777 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
1778 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1779 "prfm pldl1keep, [%0, 448] \n"
1780 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1781 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1782 "b.gt 1b \n"
1783 : "+r"(src_yuy2), // %0
1784 "+r"(dst_u), // %1
1785 "+r"(dst_v), // %2
1786 "+r"(width) // %3
1787 :
1788 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1789 );
1790 }
1791
1792 void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
1793 uint8_t* dst_u,
1794 uint8_t* dst_v,
1795 int width) {
1796 asm volatile(
1797 "1: \n"
1798 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
1799 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1800 "prfm pldl1keep, [%0, 448] \n"
1801 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1802 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1803 "b.gt 1b \n"
1804 : "+r"(src_uyvy), // %0
1805 "+r"(dst_u), // %1
1806 "+r"(dst_v), // %2
1807 "+r"(width) // %3
1808 :
1809 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1810 );
1811 }
1812
1813 void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
1814 int stride_yuy2,
1815 uint8_t* dst_u,
1816 uint8_t* dst_v,
1817 int width) {
1818 const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
1819 asm volatile(
1820 "1: \n"
1821 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1822 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1823 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1824 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1825 "prfm pldl1keep, [%0, 448] \n"
1826 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1827 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1828 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1829 "b.gt 1b \n"
1830 : "+r"(src_yuy2), // %0
1831 "+r"(src_yuy2b), // %1
1832 "+r"(dst_u), // %2
1833 "+r"(dst_v), // %3
1834 "+r"(width) // %4
1835 :
1836 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1837 "v7" // Clobber List
1838 );
1839 }
1840
1841 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
1842 int stride_uyvy,
1843 uint8_t* dst_u,
1844 uint8_t* dst_v,
1845 int width) {
1846 const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
1847 asm volatile(
1848 "1: \n"
1849 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1850 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1851 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1852 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1853 "prfm pldl1keep, [%0, 448] \n"
1854 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1855 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1856 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1857 "b.gt 1b \n"
1858 : "+r"(src_uyvy), // %0
1859 "+r"(src_uyvyb), // %1
1860 "+r"(dst_u), // %2
1861 "+r"(dst_v), // %3
1862 "+r"(width) // %4
1863 :
1864 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1865 "v7" // Clobber List
1866 );
1867 }
1868
1869 void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
1870 int stride_yuy2,
1871 uint8_t* dst_uv,
1872 int width) {
1873 const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
1874 asm volatile(
1875 "1: \n"
1876 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels
1877 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1878 "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row
1879 "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV
1880 "prfm pldl1keep, [%0, 448] \n"
1881 "st1 {v4.16b}, [%2], #16 \n" // store 8 UV.
1882 "b.gt 1b \n"
1883 : "+r"(src_yuy2), // %0
1884 "+r"(src_yuy2b), // %1
1885 "+r"(dst_uv), // %2
1886 "+r"(width) // %3
1887 :
1888 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
1889 );
1890 }
1891
1892 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1893 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
1894 uint8_t* dst_argb,
1895 const uint8_t* shuffler,
1896 int width) {
1897 asm volatile(
1898 "ld1 {v2.16b}, [%3] \n" // shuffler
1899 "1: \n"
1900 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1901 "subs %w2, %w2, #4 \n" // 4 processed per loop
1902 "prfm pldl1keep, [%0, 448] \n"
1903 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1904 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1905 "b.gt 1b \n"
1906 : "+r"(src_argb), // %0
1907 "+r"(dst_argb), // %1
1908 "+r"(width) // %2
1909 : "r"(shuffler) // %3
1910 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1911 );
1912 }
1913
1914 void I422ToYUY2Row_NEON(const uint8_t* src_y,
1915 const uint8_t* src_u,
1916 const uint8_t* src_v,
1917 uint8_t* dst_yuy2,
1918 int width) {
1919 asm volatile(
1920 "1: \n"
1921 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1922 "subs %w4, %w4, #16 \n" // 16 pixels
1923 "orr v2.8b, v1.8b, v1.8b \n"
1924 "prfm pldl1keep, [%0, 448] \n"
1925 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1926 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1927 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1928 "b.gt 1b \n"
1929 : "+r"(src_y), // %0
1930 "+r"(src_u), // %1
1931 "+r"(src_v), // %2
1932 "+r"(dst_yuy2), // %3
1933 "+r"(width) // %4
1934 :
1935 : "cc", "memory", "v0", "v1", "v2", "v3");
1936 }
1937
1938 void I422ToUYVYRow_NEON(const uint8_t* src_y,
1939 const uint8_t* src_u,
1940 const uint8_t* src_v,
1941 uint8_t* dst_uyvy,
1942 int width) {
1943 asm volatile(
1944 "1: \n"
1945 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1946 "orr v3.8b, v2.8b, v2.8b \n"
1947 "prfm pldl1keep, [%0, 448] \n"
1948 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1949 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1950 "subs %w4, %w4, #16 \n" // 16 pixels
1951 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1952 "b.gt 1b \n"
1953 : "+r"(src_y), // %0
1954 "+r"(src_u), // %1
1955 "+r"(src_v), // %2
1956 "+r"(dst_uyvy), // %3
1957 "+r"(width) // %4
1958 :
1959 : "cc", "memory", "v0", "v1", "v2", "v3");
1960 }
1961
1962 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
1963 uint8_t* dst_rgb565,
1964 int width) {
1965 asm volatile(
1966 "1: \n"
1967 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
1968 // pixels
1969 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1970 "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565
1971 "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1972 "b.gt 1b \n"
1973 : "+r"(src_argb), // %0
1974 "+r"(dst_rgb565), // %1
1975 "+r"(width) // %2
1976 :
1977 : "cc", "memory", "v16", "v17", "v18", "v19");
1978 }
1979
1980 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
1981 uint8_t* dst_rgb,
1982 uint32_t dither4,
1983 int width) {
1984 asm volatile(
1985 "dup v1.4s, %w3 \n" // dither4
1986 "1: \n"
1987 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
1988 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1989 "uqadd v16.8b, v16.8b, v1.8b \n"
1990 "prfm pldl1keep, [%0, 448] \n"
1991 "uqadd v17.8b, v17.8b, v1.8b \n"
1992 "uqadd v18.8b, v18.8b, v1.8b \n" ARGBTORGB565
1993 "st1 {v18.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1994 "b.gt 1b \n"
1995 : "+r"(src_argb), // %0
1996 "+r"(dst_rgb), // %1
1997 "+r"(width) // %2
1998 : "r"(dither4) // %3
1999 : "cc", "memory", "v1", "v16", "v17", "v18", "v19");
2000 }
2001
2002 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
2003 uint8_t* dst_argb1555,
2004 int width) {
2005 asm volatile(
2006 "1: \n"
2007 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
2008 // pixels
2009 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2010 "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555
2011 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
2012 "b.gt 1b \n"
2013 : "+r"(src_argb), // %0
2014 "+r"(dst_argb1555), // %1
2015 "+r"(width) // %2
2016 :
2017 : "cc", "memory", "v0", "v16", "v17", "v18", "v19");
2018 }
2019
2020 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
2021 uint8_t* dst_argb4444,
2022 int width) {
2023 asm volatile(
2024 "movi v23.16b, #0x0f \n" // bits to clear with
2025 // vbic.
2026 "1: \n"
2027 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8
2028 // pixels
2029 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2030 "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444
2031 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
2032 "b.gt 1b \n"
2033 : "+r"(src_argb), // %0
2034 "+r"(dst_argb4444), // %1
2035 "+r"(width) // %2
2036 :
2037 : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23");
2038 }
2039
2040 #if LIBYUV_USE_ST2
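// Storing a register interleaved with a copy of itself (ST2) writes each
// byte x as the 16-bit value x * 0x0101, the exact 8 to 16 bit expansion
// (0xff becomes 0xffff).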
2041 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
2042 uint16_t* dst_ar64,
2043 int width) {
2044 asm volatile(
2045 "1: \n"
2046 "ldp q0, q2, [%0], #32 \n" // load 8 pixels
2047 "mov v1.16b, v0.16b \n"
2048 "prfm pldl1keep, [%0, 448] \n"
2049 "mov v3.16b, v2.16b \n"
2050 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2051 "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
2052 "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
2053 "b.gt 1b \n"
2054 : "+r"(src_argb), // %0
2055 "+r"(dst_ar64), // %1
2056 "+r"(width) // %2
2057 :
2058 : "cc", "memory", "v0", "v1", "v2", "v3");
2059 }
2060
2061 static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
2062 10, 9, 8, 11, 14, 13, 12, 15};
2063
2064 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
2065 uint16_t* dst_ab64,
2066 int width) {
2067 asm volatile(
2068 "ldr q4, [%3] \n" // shuffler
2069 "1: \n"
2070 "ldp q0, q2, [%0], #32 \n" // load 8 pixels
2071 "tbl v0.16b, {v0.16b}, v4.16b \n"
2072 "tbl v2.16b, {v2.16b}, v4.16b \n"
2073 "prfm pldl1keep, [%0, 448] \n"
2074 "mov v1.16b, v0.16b \n"
2075 "mov v3.16b, v2.16b \n"
2076 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2077 "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels
2078 "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels
2079 "b.gt 1b \n"
2080 : "+r"(src_argb), // %0
2081 "+r"(dst_ab64), // %1
2082 "+r"(width) // %2
2083 : "r"(&kShuffleARGBToABGR) // %3
2084 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2085 }
2086 #else
2087 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
2088 uint16_t* dst_ar64,
2089 int width) {
2090 asm volatile(
2091 "1: \n"
2092 "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels
2093 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2094 "zip1 v2.16b, v0.16b, v0.16b \n"
2095 "zip2 v3.16b, v0.16b, v0.16b \n"
2096 "prfm pldl1keep, [%0, 448] \n"
2097 "zip1 v4.16b, v1.16b, v1.16b \n"
2098 "zip2 v5.16b, v1.16b, v1.16b \n"
2099 "st1 {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n" // 8 AR64
2100 "b.gt 1b \n"
2101 : "+r"(src_argb), // %0
2102 "+r"(dst_ar64), // %1
2103 "+r"(width) // %2
2104 :
2105 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
2106 }
2107
2108 static const uvec8 kShuffleARGBToAB64[2] = {
2109 {2, 2, 1, 1, 0, 0, 3, 3, 6, 6, 5, 5, 4, 4, 7, 7},
2110 {10, 10, 9, 9, 8, 8, 11, 11, 14, 14, 13, 13, 12, 12, 15, 15}};
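// Each index appears twice, so the TBL both swaps R and B and duplicates
// every byte, producing 16-bit channels equal to x * 0x0101.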
2111
2112 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
2113 uint16_t* dst_ab64,
2114 int width) {
2115 asm volatile(
2116 "ldp q6, q7, [%3] \n" // 2 shufflers
2117 "1: \n"
2118 "ldp q0, q1, [%0], #32 \n" // load 8 pixels
2119 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2120 "tbl v2.16b, {v0.16b}, v6.16b \n" // ARGB to AB64
2121 "tbl v3.16b, {v0.16b}, v7.16b \n"
2122 "prfm pldl1keep, [%0, 448] \n"
2123 "tbl v4.16b, {v1.16b}, v6.16b \n"
2124 "tbl v5.16b, {v1.16b}, v7.16b \n"
2125       "st1         {v2.8h, v3.8h, v4.8h, v5.8h}, [%1], #64 \n"  // 8 AB64
2126 "b.gt 1b \n"
2127 : "+r"(src_argb), // %0
2128 "+r"(dst_ab64), // %1
2129 "+r"(width) // %2
2130 : "r"(&kShuffleARGBToAB64[0]) // %3
2131       : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2132 }
2133 #endif // LIBYUV_USE_ST2
2134
2135 static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15,
2136 17, 19, 21, 23, 25, 27, 29, 31};
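// Indices 1, 3, 5, ... select the high byte of each little-endian 16-bit
// channel from the two-register table, so AR64 to ARGB is a narrowing
// shuffle that keeps the top 8 bits of each channel.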
2137
2138 void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
2139 uint8_t* dst_argb,
2140 int width) {
2141 asm volatile(
2142 "ldr q4, [%3] \n" // shuffler
2143 "1: \n"
2144 "ldp q0, q1, [%0], #32 \n" // load 4 pixels
2145 "ldp q2, q3, [%0], #32 \n" // load 4 pixels
2146 "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
2147 "prfm pldl1keep, [%0, 448] \n"
2148 "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
2149 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2150 "stp q0, q2, [%1], #32 \n" // store 8 pixels
2151 "b.gt 1b \n"
2152 : "+r"(src_ar64), // %0
2153 "+r"(dst_argb), // %1
2154 "+r"(width) // %2
2155 : "r"(&kShuffleAR64ToARGB) // %3
2156 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2157 }
2158
2159 static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15,
2160 21, 19, 17, 23, 29, 27, 25, 31};
2161
2162 void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
2163 uint8_t* dst_argb,
2164 int width) {
2165 asm volatile(
2166 "ldr q4, [%3] \n" // shuffler
2167 "1: \n"
2168 "ldp q0, q1, [%0], #32 \n" // load 4 pixels
2169 "ldp q2, q3, [%0], #32 \n" // load 4 pixels
2170 "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n"
2171 "prfm pldl1keep, [%0, 448] \n"
2172 "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n"
2173 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2174 "stp q0, q2, [%1], #32 \n" // store 8 pixels
2175 "b.gt 1b \n"
2176 : "+r"(src_ab64), // %0
2177 "+r"(dst_argb), // %1
2178 "+r"(width) // %2
2179 : "r"(&kShuffleAB64ToARGB) // %3
2180 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
2181 }
2182
2183 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
2184 uint8_t* dst_a,
2185 int width) {
2186 asm volatile(
2187 "1: \n"
2188 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
2189 "prfm pldl1keep, [%0, 448] \n"
2190 "subs %w2, %w2, #16 \n" // 16 processed per loop
2191 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
2192 "b.gt 1b \n"
2193 : "+r"(src_argb), // %0
2194 "+r"(dst_a), // %1
2195 "+r"(width) // %2
2196 :
2197 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2198 );
2199 }
2200
2201 struct RgbUVConstants {
2202 uint8_t kRGBToU[4];
2203 uint8_t kRGBToV[4];
2204 };
2205
2206 // 8x1 pixels.
2207 void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
2208 uint8_t* dst_u,
2209 uint8_t* dst_v,
2210 int width,
2211 const struct RgbUVConstants* rgbuvconstants) {
2212 asm volatile(
2213 "ldr d0, [%4] \n" // load rgbuvconstants
2214 "dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
2215 "dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient
2216 "dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
2217 "dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
2218 "dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
2219 "movi v29.16b, #0x80 \n" // 128.5
2220
2221 "1: \n"
2222 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
2223 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2224 "umull v4.8h, v0.8b, v24.8b \n" // B
2225 "umlsl v4.8h, v1.8b, v25.8b \n" // G
2226 "umlsl v4.8h, v2.8b, v26.8b \n" // R
2227 "prfm pldl1keep, [%0, 448] \n"
2228
2229 "umull v3.8h, v2.8b, v24.8b \n" // R
2230 "umlsl v3.8h, v1.8b, v28.8b \n" // G
2231 "umlsl v3.8h, v0.8b, v27.8b \n" // B
2232
2233 "addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
2234 "addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned
2235
2236 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
2237 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
2238 "b.gt 1b \n"
2239 : "+r"(src_argb), // %0
2240 "+r"(dst_u), // %1
2241 "+r"(dst_v), // %2
2242 "+r"(width) // %3
2243 : "r"(rgbuvconstants) // %4
2244 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
2245 "v27", "v28", "v29");
2246 }
2247
2248 // RGB to bt601 coefficients
2249 // UB 0.875 coefficient = 112
2250 // UG -0.5781 coefficient = 74
2251 // UR -0.2969 coefficient = 38
2252 // VB -0.1406 coefficient = 18
2253 // VG -0.7344 coefficient = 94
2254 // VR 0.875 coefficient = 112 (ignored)
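// Effectively, for BT.601:
// U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
// V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8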
2255
2256 static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
2257 {18, 94, 112, 0}};
2258
2259 // RGB to JPeg coefficients
2260 // UB coeff 0.500 = 127
2261 // UG coeff -0.33126 = 84
2262 // UR coeff -0.16874 = 43
2263 // VB coeff -0.08131 = 20
2264 // VG coeff -0.41869 = 107
2265 // VR coeff 0.500 = 127 (ignored)
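// Effectively, for JPeg:
// U = (127 * B - 84 * G - 43 * R + 0x8080) >> 8
// V = (127 * R - 107 * G - 20 * B + 0x8080) >> 8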
2266
2267 static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
2268 {20, 107, 127, 0}};
2269
2270 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
2271 uint8_t* dst_u,
2272 uint8_t* dst_v,
2273 int width) {
2274 ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
2275 &kRgb24I601UVConstants);
2276 }
2277
2278 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
2279 uint8_t* dst_u,
2280 uint8_t* dst_v,
2281 int width) {
2282 ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
2283 &kRgb24JPegUVConstants);
2284 }
2285
2286 #define RGBTOUV_SETUP_REG \
2287 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
2288 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
2289 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
2290 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
2291 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
2292 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
2293
2294 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
2295 // clang-format off
2296 #define RGBTOUV(QB, QG, QR) \
2297 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
2298 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
2299 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
2300 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
2301 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
2302 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
2303 "addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
2304 "addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */
2305 // clang-format on
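// The box-filtered B/G/R values passed in are roughly twice the pixel
// average (2x2 sum halved by urshr #1), so the halved coefficients above
// give e.g. U = (56 * 2B - 37 * 2G - 19 * 2R + 0x8080) >> 8, matching the
// full-size 112/74/38 weights. addhn adds the 0x8080 bias and keeps the
// high byte.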
2306
2307 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
2308 // TODO(fbarchard): consider ptrdiff_t for all strides.
2309
2310 void ARGBToUVRow_NEON(const uint8_t* src_argb,
2311 int src_stride_argb,
2312 uint8_t* dst_u,
2313 uint8_t* dst_v,
2314 int width) {
2315 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
2316 asm volatile (
2317 RGBTOUV_SETUP_REG
2318 "1: \n"
2319 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2320 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
2321 "prfm pldl1keep, [%0, 448] \n"
2322 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2323 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
2324
2325 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
2326 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
2327 "prfm pldl1keep, [%1, 448] \n"
2328 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2329 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
2330
2331 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2332 "urshr v1.8h, v1.8h, #1 \n"
2333 "urshr v2.8h, v2.8h, #1 \n"
2334
2335 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2336 RGBTOUV(v0.8h, v1.8h, v2.8h)
2337 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2338 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2339 "b.gt 1b \n"
2340 : "+r"(src_argb), // %0
2341 "+r"(src_argb_1), // %1
2342 "+r"(dst_u), // %2
2343 "+r"(dst_v), // %3
2344 "+r"(width) // %4
2345 :
2346 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2347 "v20", "v21", "v22", "v23", "v24", "v25"
2348 );
2349 }
2350
2351 // TODO(fbarchard): Subsample match Intel code.
2352 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
2353 int src_stride_argb,
2354 uint8_t* dst_u,
2355 uint8_t* dst_v,
2356 int width) {
2357 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
2358 asm volatile (
2359 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
2360 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
2361 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
2362 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
2363 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
2364 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
2365 "1: \n"
2366 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2367 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
2368 "prfm pldl1keep, [%0, 448] \n"
2369 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2370 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
2371 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
2372 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
2373 "prfm pldl1keep, [%1, 448] \n"
2374 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2375 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
2376
2377 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2378 "urshr v1.8h, v1.8h, #1 \n"
2379 "urshr v2.8h, v2.8h, #1 \n"
2380
2381 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2382 RGBTOUV(v0.8h, v1.8h, v2.8h)
2383 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2384 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2385 "b.gt 1b \n"
2386 : "+r"(src_argb), // %0
2387 "+r"(src_argb_1), // %1
2388 "+r"(dst_u), // %2
2389 "+r"(dst_v), // %3
2390 "+r"(width) // %4
2391 :
2392 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2393 "v20", "v21", "v22", "v23", "v24", "v25"
2394 );
2395 }
2396
2397 void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
2398 int src_stride_abgr,
2399 uint8_t* dst_uj,
2400 uint8_t* dst_vj,
2401 int width) {
2402 const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
2403 asm volatile (
2404 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
2405 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
2406 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
2407 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
2408 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
2409 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
2410 "1: \n"
2411 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2412 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
2413 "prfm pldl1keep, [%0, 448] \n"
2414 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2415 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
2416 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
2417 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
2418 "prfm pldl1keep, [%1, 448] \n"
2419 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2420 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
2421
2422 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2423 "urshr v1.8h, v1.8h, #1 \n"
2424 "urshr v2.8h, v2.8h, #1 \n"
2425
2426 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2427 RGBTOUV(v2.8h, v1.8h, v0.8h)
2428 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2429 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2430 "b.gt 1b \n"
2431 : "+r"(src_abgr), // %0
2432 "+r"(src_abgr_1), // %1
2433 "+r"(dst_uj), // %2
2434 "+r"(dst_vj), // %3
2435 "+r"(width) // %4
2436 :
2437 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2438 "v20", "v21", "v22", "v23", "v24", "v25"
2439 );
2440 }
2441
2442 void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
2443 int src_stride_rgb24,
2444 uint8_t* dst_u,
2445 uint8_t* dst_v,
2446 int width) {
2447 const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
2448 asm volatile (
2449 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
2450 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
2451 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
2452 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
2453 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
2454 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
2455 "1: \n"
2456 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
2457 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
2458 "prfm pldl1keep, [%0, 448] \n"
2459 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2460 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
2461 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
2462 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
2463 "prfm pldl1keep, [%1, 448] \n"
2464 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2465 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
2466
2467 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2468 "urshr v1.8h, v1.8h, #1 \n"
2469 "urshr v2.8h, v2.8h, #1 \n"
2470
2471 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2472 RGBTOUV(v0.8h, v1.8h, v2.8h)
2473 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2474 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2475 "b.gt 1b \n"
2476 : "+r"(src_rgb24), // %0
2477 "+r"(src_rgb24_1), // %1
2478 "+r"(dst_u), // %2
2479 "+r"(dst_v), // %3
2480 "+r"(width) // %4
2481 :
2482 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2483 "v20", "v21", "v22", "v23", "v24", "v25"
2484 );
2485 }
2486
2487 void RAWToUVJRow_NEON(const uint8_t* src_raw,
2488 int src_stride_raw,
2489 uint8_t* dst_u,
2490 uint8_t* dst_v,
2491 int width) {
2492 const uint8_t* src_raw_1 = src_raw + src_stride_raw;
2493 asm volatile (
2494 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
2495 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
2496 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
2497 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
2498 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
2499 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
2500 "1: \n"
2501 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
2502 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
2503 "prfm pldl1keep, [%0, 448] \n"
2504 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2505 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
2506 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
2507 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
2508 "prfm pldl1keep, [%1, 448] \n"
2509 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2510 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
2511
2512 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2513 "urshr v1.8h, v1.8h, #1 \n"
2514 "urshr v2.8h, v2.8h, #1 \n"
2515
2516 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2517 RGBTOUV(v2.8h, v1.8h, v0.8h)
2518 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2519 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2520 "b.gt 1b \n"
2521 : "+r"(src_raw), // %0
2522 "+r"(src_raw_1), // %1
2523 "+r"(dst_u), // %2
2524 "+r"(dst_v), // %3
2525 "+r"(width) // %4
2526 :
2527 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2528 "v20", "v21", "v22", "v23", "v24", "v25"
2529 );
2530 }
2531
2532 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
2533 int src_stride_bgra,
2534 uint8_t* dst_u,
2535 uint8_t* dst_v,
2536 int width) {
2537 const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
2538 asm volatile (
2539 RGBTOUV_SETUP_REG
2540 "1: \n"
2541 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2542 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
2543 "prfm pldl1keep, [%0, 448] \n"
2544 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
2545 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
2546 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
2547 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
2548 "prfm pldl1keep, [%1, 448] \n"
2549 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
2550 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
2551
2552 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2553 "urshr v1.8h, v3.8h, #1 \n"
2554 "urshr v2.8h, v2.8h, #1 \n"
2555
2556 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2557 RGBTOUV(v0.8h, v1.8h, v2.8h)
2558 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2559 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2560 "b.gt 1b \n"
2561 : "+r"(src_bgra), // %0
2562 "+r"(src_bgra_1), // %1
2563 "+r"(dst_u), // %2
2564 "+r"(dst_v), // %3
2565 "+r"(width) // %4
2566 :
2567 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2568 "v20", "v21", "v22", "v23", "v24", "v25"
2569 );
2570 }
2571
2572 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
2573 int src_stride_abgr,
2574 uint8_t* dst_u,
2575 uint8_t* dst_v,
2576 int width) {
2577 const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
2578 asm volatile (
2579 RGBTOUV_SETUP_REG
2580 "1: \n"
2581 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2582 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
2583 "prfm pldl1keep, [%0, 448] \n"
2584 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2585 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
2586 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
2587 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
2588 "prfm pldl1keep, [%1, 448] \n"
2589 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2590 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
2591
2592 "urshr v0.8h, v3.8h, #1 \n" // 2x average
2593 "urshr v2.8h, v2.8h, #1 \n"
2594 "urshr v1.8h, v1.8h, #1 \n"
2595
2596 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2597 RGBTOUV(v0.8h, v2.8h, v1.8h)
2598 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2599 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2600 "b.gt 1b \n"
2601 : "+r"(src_abgr), // %0
2602 "+r"(src_abgr_1), // %1
2603 "+r"(dst_u), // %2
2604 "+r"(dst_v), // %3
2605 "+r"(width) // %4
2606 :
2607 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2608 "v20", "v21", "v22", "v23", "v24", "v25"
2609 );
2610 }
2611
2612 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
2613 int src_stride_rgba,
2614 uint8_t* dst_u,
2615 uint8_t* dst_v,
2616 int width) {
2617 const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
2618 asm volatile (
2619 RGBTOUV_SETUP_REG
2620 "1: \n"
2621 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
2622 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
2623 "prfm pldl1keep, [%0, 448] \n"
2624 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
2625 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
2626 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
2627 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
2628 "prfm pldl1keep, [%1, 448] \n"
2629 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
2630 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
2631
2632 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2633 "urshr v1.8h, v1.8h, #1 \n"
2634 "urshr v2.8h, v2.8h, #1 \n"
2635
2636 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2637 RGBTOUV(v0.8h, v1.8h, v2.8h)
2638 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2639 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2640 "b.gt 1b \n"
2641 : "+r"(src_rgba), // %0
2642 "+r"(src_rgba_1), // %1
2643 "+r"(dst_u), // %2
2644 "+r"(dst_v), // %3
2645 "+r"(width) // %4
2646 :
2647 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2648 "v20", "v21", "v22", "v23", "v24", "v25"
2649 );
2650 }
2651
2652 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
2653 int src_stride_rgb24,
2654 uint8_t* dst_u,
2655 uint8_t* dst_v,
2656 int width) {
2657 const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
2658 asm volatile (
2659 RGBTOUV_SETUP_REG
2660 "1: \n"
2661 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
2662 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
2663 "prfm pldl1keep, [%0, 448] \n"
2664 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2665 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
2666 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
2667 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
2668 "prfm pldl1keep, [%1, 448] \n"
2669 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2670 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
2671
2672 "urshr v0.8h, v0.8h, #1 \n" // 2x average
2673 "urshr v1.8h, v1.8h, #1 \n"
2674 "urshr v2.8h, v2.8h, #1 \n"
2675
2676 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2677 RGBTOUV(v0.8h, v1.8h, v2.8h)
2678 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2679 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2680 "b.gt 1b \n"
2681 : "+r"(src_rgb24), // %0
2682 "+r"(src_rgb24_1), // %1
2683 "+r"(dst_u), // %2
2684 "+r"(dst_v), // %3
2685 "+r"(width) // %4
2686 :
2687 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2688 "v20", "v21", "v22", "v23", "v24", "v25"
2689 );
2690 }
2691
2692 void RAWToUVRow_NEON(const uint8_t* src_raw,
2693 int src_stride_raw,
2694 uint8_t* dst_u,
2695 uint8_t* dst_v,
2696 int width) {
2697 const uint8_t* src_raw_1 = src_raw + src_stride_raw;
2698 asm volatile (
2699 RGBTOUV_SETUP_REG
2700 "1: \n"
2701 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
2702 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
2703 "prfm pldl1keep, [%0, 448] \n"
2704 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
2705 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
2706     "ld3         {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more RAW pixels
2707 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
2708 "prfm pldl1keep, [%1, 448] \n"
2709 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
2710 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
2711
2712 "urshr v2.8h, v2.8h, #1 \n" // 2x average
2713 "urshr v1.8h, v1.8h, #1 \n"
2714 "urshr v0.8h, v0.8h, #1 \n"
2715
2716 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2717 RGBTOUV(v2.8h, v1.8h, v0.8h)
2718 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2719 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2720 "b.gt 1b \n"
2721 : "+r"(src_raw), // %0
2722 "+r"(src_raw_1), // %1
2723 "+r"(dst_u), // %2
2724 "+r"(dst_v), // %3
2725 "+r"(width) // %4
2726 :
2727 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2728 "v20", "v21", "v22", "v23", "v24", "v25"
2729 );
2730 }
2731
2732 // 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
2733 void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
2734 int src_stride_rgb565,
2735 uint8_t* dst_u,
2736 uint8_t* dst_v,
2737 int width) {
2738 const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
2739 asm volatile(
2740 RGBTOUV_SETUP_REG
2741 "1: \n"
2742 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2743 RGB565TOARGB
2744 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2745 "prfm pldl1keep, [%0, 448] \n"
2746 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2747 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2748 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
2749 RGB565TOARGB
2750 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2751 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2752 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2753
2754 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
2755 RGB565TOARGB
2756 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2757 "prfm pldl1keep, [%1, 448] \n"
2758 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2759 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2760 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
2761 RGB565TOARGB
2762 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2763 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2764 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2765
2766 "ins v16.D[1], v26.D[0] \n"
2767 "ins v17.D[1], v27.D[0] \n"
2768 "ins v18.D[1], v28.D[0] \n"
2769
2770 "urshr v0.8h, v16.8h, #1 \n" // 2x average
2771 "urshr v1.8h, v17.8h, #1 \n"
2772 "urshr v2.8h, v18.8h, #1 \n"
2773
2774 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2775 RGBTOUV(v0.8h, v1.8h, v2.8h)
2776 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2777 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2778 "b.gt 1b \n"
2779 : "+r"(src_rgb565), // %0
2780 "+r"(src_rgb565_1), // %1
2781 "+r"(dst_u), // %2
2782 "+r"(dst_v), // %3
2783 "+r"(width) // %4
2784 :
2785 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
2786 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2787 "v28");
2788 }
2789
2790 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
2791 void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
2792 int src_stride_argb1555,
2793 uint8_t* dst_u,
2794 uint8_t* dst_v,
2795 int width) {
2796 const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
2797 asm volatile(
2798 RGBTOUV_SETUP_REG
2799 "1: \n"
2800 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2801 RGB555TOARGB
2802 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2803 "prfm pldl1keep, [%0, 448] \n"
2804 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2805 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2806 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
2807 RGB555TOARGB
2808 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2809 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2810 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2811
2812 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
2813 RGB555TOARGB
2814 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2815 "prfm pldl1keep, [%1, 448] \n"
2816 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2817 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2818 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
2819 RGB555TOARGB
2820 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2821 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2822 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2823
2824 "ins v16.D[1], v26.D[0] \n"
2825 "ins v17.D[1], v27.D[0] \n"
2826 "ins v18.D[1], v28.D[0] \n"
2827
2828 "urshr v0.8h, v16.8h, #1 \n" // 2x average
2829 "urshr v1.8h, v17.8h, #1 \n"
2830 "urshr v2.8h, v18.8h, #1 \n"
2831
2832 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2833 RGBTOUV(v0.8h, v1.8h, v2.8h)
2834 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2835 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2836 "b.gt 1b \n"
2837 : "+r"(src_argb1555), // %0
2838 "+r"(src_argb1555_1), // %1
2839 "+r"(dst_u), // %2
2840 "+r"(dst_v), // %3
2841 "+r"(width) // %4
2842 :
2843 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
2844 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2845 "v28");
2846 }
2847
2848 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
2849 void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
2850 int src_stride_argb4444,
2851 uint8_t* dst_u,
2852 uint8_t* dst_v,
2853 int width) {
2854 const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
2855 asm volatile(
2856 RGBTOUV_SETUP_REG // sets v20-v25
2857 "1: \n"
2858 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2859 ARGB4444TOARGB
2860 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2861 "prfm pldl1keep, [%0, 448] \n"
2862 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2863 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2864 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
2865 ARGB4444TOARGB
2866 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2867 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2868 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2869
2870 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
2871 ARGB4444TOARGB
2872 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2873 "prfm pldl1keep, [%1, 448] \n"
2874 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2875 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2876 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
2877 ARGB4444TOARGB
2878 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
2879 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
2880 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
2881
2882 "ins v16.D[1], v26.D[0] \n"
2883 "ins v17.D[1], v27.D[0] \n"
2884 "ins v18.D[1], v28.D[0] \n"
2885
2886 "urshr v0.8h, v16.8h, #1 \n" // 2x average
2887 "urshr v1.8h, v17.8h, #1 \n"
2888 "urshr v2.8h, v18.8h, #1 \n"
2889
2890 "subs %w4, %w4, #16 \n" // 16 processed per loop.
2891 RGBTOUV(v0.8h, v1.8h, v2.8h)
2892 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
2893 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
2894 "b.gt 1b \n"
2895 : "+r"(src_argb4444), // %0
2896 "+r"(src_argb4444_1), // %1
2897 "+r"(dst_u), // %2
2898 "+r"(dst_v), // %3
2899 "+r"(width) // %4
2900 :
2901 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
2902 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
2903 "v28"
2904
2905 );
2906 }
2907
2908 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
2909 asm volatile(
2910 "movi v24.8b, #25 \n" // B * 0.1016 coefficient
2911 "movi v25.8b, #129 \n" // G * 0.5078 coefficient
2912 "movi v26.8b, #66 \n" // R * 0.2578 coefficient
2913 "movi v27.8b, #16 \n" // Add 16 constant
2914 "1: \n"
2915 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
2916 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2917 RGB565TOARGB
2918 "umull v3.8h, v0.8b, v24.8b \n" // B
2919 "prfm pldl1keep, [%0, 448] \n"
2920 "umlal v3.8h, v1.8b, v25.8b \n" // G
2921 "umlal v3.8h, v2.8b, v26.8b \n" // R
2922 "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
2923 "uqadd v0.8b, v0.8b, v27.8b \n"
2924 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2925 "b.gt 1b \n"
2926 : "+r"(src_rgb565), // %0
2927 "+r"(dst_y), // %1
2928 "+r"(width) // %2
2929 :
2930 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
2931 "v27");
2932 }
2933
2934 void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
2935 uint8_t* dst_y,
2936 int width) {
2937 asm volatile(
2938 "movi v4.8b, #25 \n" // B * 0.1016 coefficient
2939 "movi v5.8b, #129 \n" // G * 0.5078 coefficient
2940 "movi v6.8b, #66 \n" // R * 0.2578 coefficient
2941 "movi v7.8b, #16 \n" // Add 16 constant
2942 "1: \n"
2943 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
2944 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2945 ARGB1555TOARGB
2946 "umull v3.8h, v0.8b, v4.8b \n" // B
2947 "prfm pldl1keep, [%0, 448] \n"
2948 "umlal v3.8h, v1.8b, v5.8b \n" // G
2949 "umlal v3.8h, v2.8b, v6.8b \n" // R
2950 "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
2951 "uqadd v0.8b, v0.8b, v7.8b \n"
2952 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2953 "b.gt 1b \n"
2954 : "+r"(src_argb1555), // %0
2955 "+r"(dst_y), // %1
2956 "+r"(width) // %2
2957 :
2958 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2959 }
2960
2961 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
2962 uint8_t* dst_y,
2963 int width) {
2964 asm volatile(
2965 "movi v24.8b, #25 \n" // B * 0.1016 coefficient
2966 "movi v25.8b, #129 \n" // G * 0.5078 coefficient
2967 "movi v26.8b, #66 \n" // R * 0.2578 coefficient
2968 "movi v27.8b, #16 \n" // Add 16 constant
2969 "1: \n"
2970 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
2971 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2972 ARGB4444TOARGB
2973 "umull v3.8h, v0.8b, v24.8b \n" // B
2974 "prfm pldl1keep, [%0, 448] \n"
2975 "umlal v3.8h, v1.8b, v25.8b \n" // G
2976 "umlal v3.8h, v2.8b, v26.8b \n" // R
2977 "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
2978 "uqadd v0.8b, v0.8b, v27.8b \n"
2979 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2980 "b.gt 1b \n"
2981 : "+r"(src_argb4444), // %0
2982 "+r"(dst_y), // %1
2983 "+r"(width) // %2
2984 :
2985 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
2986 }
2987
2988 struct RgbConstants {
2989 uint8_t kRGBToY[4];
2990 uint16_t kAddY;
2991 };
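// Y = (kRGBToY[0] * c0 + kRGBToY[1] * c1 + kRGBToY[2] * c2 + kAddY) >> 8,
// where c0..c2 are the first three bytes of each pixel, computed with
// umull/umlal and a final addhn that adds kAddY and keeps the high byte.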
2992
2993 // ARGB expects first 3 values to contain RGB and 4th value is ignored.
2994 void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
2995 uint8_t* dst_y,
2996 int width,
2997 const struct RgbConstants* rgbconstants) {
2998 asm volatile(
2999 "ldr d0, [%3] \n" // load rgbconstants
3000 "dup v6.16b, v0.b[0] \n"
3001 "dup v7.16b, v0.b[1] \n"
3002 "dup v16.16b, v0.b[2] \n"
3003 "dup v17.8h, v0.h[2] \n"
3004 "1: \n"
3005 "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
3006 // pixels.
3007 "subs %w2, %w2, #16 \n" // 16 processed per loop.
3008 "umull v0.8h, v2.8b, v6.8b \n" // B
3009 "umull2 v1.8h, v2.16b, v6.16b \n"
3010 "prfm pldl1keep, [%0, 448] \n"
3011 "umlal v0.8h, v3.8b, v7.8b \n" // G
3012 "umlal2 v1.8h, v3.16b, v7.16b \n"
3013 "umlal v0.8h, v4.8b, v16.8b \n" // R
3014 "umlal2 v1.8h, v4.16b, v16.16b \n"
3015 "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
3016 "addhn v1.8b, v1.8h, v17.8h \n"
3017 "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
3018 "b.gt 1b \n"
3019 : "+r"(src_argb), // %0
3020 "+r"(dst_y), // %1
3021 "+r"(width) // %2
3022 : "r"(rgbconstants) // %3
3023 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3024 "v17");
3025 }
3026
3027 // RGB to JPeg coefficients
3028 // B * 0.1140 coefficient = 29
3029 // G * 0.5870 coefficient = 150
3030 // R * 0.2990 coefficient = 77
3031 // Add 0.5 = 0x80
3032 static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
3033
3034 static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
3035
3036 // RGB to BT.601 coefficients
3037 // B * 0.1016 coefficient = 25
3038 // G * 0.5078 coefficient = 129
3039 // R * 0.2578 coefficient = 66
3040 // Add 16.5 = 0x1080
3041
3042 static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
3043 0x1080};
3044
3045 static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
3046
3047 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
3048 ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
3049 }
3050
3051 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
3052 ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
3053 }
3054
3055 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
3056 ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
3057 }
3058
3059 void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
3060 ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
3061 }
3062
3063 // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
3064 // Same code as ARGB, except the LD4
3065 void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
3066 uint8_t* dst_y,
3067 int width,
3068 const struct RgbConstants* rgbconstants) {
3069 asm volatile(
3070 "ldr d0, [%3] \n" // load rgbconstants
3071 "dup v6.16b, v0.b[0] \n"
3072 "dup v7.16b, v0.b[1] \n"
3073 "dup v16.16b, v0.b[2] \n"
3074 "dup v17.8h, v0.h[2] \n"
3075 "1: \n"
3076 "ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
3077 // pixels.
3078 "subs %w2, %w2, #16 \n" // 16 processed per loop.
3079 "umull v0.8h, v2.8b, v6.8b \n" // B
3080 "umull2 v1.8h, v2.16b, v6.16b \n"
3081 "prfm pldl1keep, [%0, 448] \n"
3082 "umlal v0.8h, v3.8b, v7.8b \n" // G
3083 "umlal2 v1.8h, v3.16b, v7.16b \n"
3084 "umlal v0.8h, v4.8b, v16.8b \n" // R
3085 "umlal2 v1.8h, v4.16b, v16.16b \n"
3086 "addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
3087 "addhn v1.8b, v1.8h, v17.8h \n"
3088 "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
3089 "b.gt 1b \n"
3090 : "+r"(src_rgba), // %0
3091 "+r"(dst_y), // %1
3092 "+r"(width) // %2
3093 : "r"(rgbconstants) // %3
3094 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3095 "v17");
3096 }
3097
3098 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
3099 RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
3100 }
3101
3102 void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
3103 RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
3104 }
3105
3106 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
3107 RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
3108 }
3109
3110 void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
3111 uint8_t* dst_y,
3112 int width,
3113 const struct RgbConstants* rgbconstants) {
3114 asm volatile(
3115 "ldr d0, [%3] \n" // load rgbconstants
3116 "dup v5.16b, v0.b[0] \n"
3117 "dup v6.16b, v0.b[1] \n"
3118 "dup v7.16b, v0.b[2] \n"
3119 "dup v16.8h, v0.h[2] \n"
3120 "1: \n"
3121 "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
3122 "subs %w2, %w2, #16 \n" // 16 processed per loop.
3123 "umull v0.8h, v2.8b, v5.8b \n" // B
3124 "umull2 v1.8h, v2.16b, v5.16b \n"
3125 "prfm pldl1keep, [%0, 448] \n"
3126 "umlal v0.8h, v3.8b, v6.8b \n" // G
3127 "umlal2 v1.8h, v3.16b, v6.16b \n"
3128 "umlal v0.8h, v4.8b, v7.8b \n" // R
3129 "umlal2 v1.8h, v4.16b, v7.16b \n"
3130 "addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
3131 "addhn v1.8b, v1.8h, v16.8h \n"
3132 "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
3133 "b.gt 1b \n"
3134 : "+r"(src_rgb), // %0
3135 "+r"(dst_y), // %1
3136 "+r"(width) // %2
3137 : "r"(rgbconstants) // %3
3138 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
3139 }
3140
3141 void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
3142 RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
3143 }
3144
3145 void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
3146 RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
3147 }
3148
3149 void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
3150 RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
3151 }
3152
3153 void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
3154 RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
3155 }
3156
3157 // Bilinear filter 16x2 -> 16x1
3158 void InterpolateRow_NEON(uint8_t* dst_ptr,
3159 const uint8_t* src_ptr,
3160 ptrdiff_t src_stride,
3161 int dst_width,
3162 int source_y_fraction) {
3163 int y1_fraction = source_y_fraction;
3164 int y0_fraction = 256 - y1_fraction;
3165 const uint8_t* src_ptr1 = src_ptr + src_stride;
3166 asm volatile(
3167 "cmp %w4, #0 \n"
3168 "b.eq 100f \n"
3169 "cmp %w4, #128 \n"
3170 "b.eq 50f \n"
3171
3172 "dup v5.16b, %w4 \n"
3173 "dup v4.16b, %w5 \n"
3174 // General purpose row blend.
3175 "1: \n"
3176 "ld1 {v0.16b}, [%1], #16 \n"
3177 "ld1 {v1.16b}, [%2], #16 \n"
3178 "subs %w3, %w3, #16 \n"
3179 "umull v2.8h, v0.8b, v4.8b \n"
3180 "prfm pldl1keep, [%1, 448] \n"
3181 "umull2 v3.8h, v0.16b, v4.16b \n"
3182 "prfm pldl1keep, [%2, 448] \n"
3183 "umlal v2.8h, v1.8b, v5.8b \n"
3184 "umlal2 v3.8h, v1.16b, v5.16b \n"
3185 "rshrn v0.8b, v2.8h, #8 \n"
3186 "rshrn2 v0.16b, v3.8h, #8 \n"
3187 "st1 {v0.16b}, [%0], #16 \n"
3188 "b.gt 1b \n"
3189 "b 99f \n"
3190
3191 // Blend 50 / 50.
3192 "50: \n"
3193 "ld1 {v0.16b}, [%1], #16 \n"
3194 "ld1 {v1.16b}, [%2], #16 \n"
3195 "subs %w3, %w3, #16 \n"
3196 "prfm pldl1keep, [%1, 448] \n"
3197 "urhadd v0.16b, v0.16b, v1.16b \n"
3198 "prfm pldl1keep, [%2, 448] \n"
3199 "st1 {v0.16b}, [%0], #16 \n"
3200 "b.gt 50b \n"
3201 "b 99f \n"
3202
3203 // Blend 100 / 0 - Copy row unchanged.
3204 "100: \n"
3205 "ld1 {v0.16b}, [%1], #16 \n"
3206 "subs %w3, %w3, #16 \n"
3207 "prfm pldl1keep, [%1, 448] \n"
3208 "st1 {v0.16b}, [%0], #16 \n"
3209 "b.gt 100b \n"
3210
3211 "99: \n"
3212 : "+r"(dst_ptr), // %0
3213 "+r"(src_ptr), // %1
3214 "+r"(src_ptr1), // %2
3215 "+r"(dst_width), // %3
3216 "+r"(y1_fraction), // %4
3217 "+r"(y0_fraction) // %5
3218 :
3219 : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
3220 }
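// Illustrative scalar sketch (hypothetical helper) of the general blend path
// in InterpolateRow_NEON: each output byte is a rounded weighted average of
// the two source rows, matching the umull/umlal/rshrn #8 sequence. The 0 and
// 128 fraction cases above are just a copy and an unsigned rounding halving
// add (urhadd).
static inline uint8_t InterpolateSketch(uint8_t s0, uint8_t s1,
                                        int y1_fraction) {
  int y0_fraction = 256 - y1_fraction;
  return (uint8_t)((s0 * y0_fraction + s1 * y1_fraction + 128) >> 8);
}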
3221
3222 // Bilinear filter 8x2 -> 8x1
3223 void InterpolateRow_16_NEON(uint16_t* dst_ptr,
3224 const uint16_t* src_ptr,
3225 ptrdiff_t src_stride,
3226 int dst_width,
3227 int source_y_fraction) {
3228 int y1_fraction = source_y_fraction;
3229 int y0_fraction = 256 - y1_fraction;
3230 const uint16_t* src_ptr1 = src_ptr + src_stride;
3231
3232 asm volatile(
3233 "cmp %w4, #0 \n"
3234 "b.eq 100f \n"
3235 "cmp %w4, #128 \n"
3236 "b.eq 50f \n"
3237
3238 "dup v5.8h, %w4 \n"
3239 "dup v4.8h, %w5 \n"
3240 // General purpose row blend.
3241 "1: \n"
3242 "ld1 {v0.8h}, [%1], #16 \n"
3243 "ld1 {v1.8h}, [%2], #16 \n"
3244 "subs %w3, %w3, #8 \n"
3245 "umull v2.4s, v0.4h, v4.4h \n"
3246 "prfm pldl1keep, [%1, 448] \n"
3247 "umull2 v3.4s, v0.8h, v4.8h \n"
3248 "prfm pldl1keep, [%2, 448] \n"
3249 "umlal v2.4s, v1.4h, v5.4h \n"
3250 "umlal2 v3.4s, v1.8h, v5.8h \n"
3251 "rshrn v0.4h, v2.4s, #8 \n"
3252 "rshrn2 v0.8h, v3.4s, #8 \n"
3253 "st1 {v0.8h}, [%0], #16 \n"
3254 "b.gt 1b \n"
3255 "b 99f \n"
3256
3257 // Blend 50 / 50.
3258 "50: \n"
3259 "ld1 {v0.8h}, [%1], #16 \n"
3260 "ld1 {v1.8h}, [%2], #16 \n"
3261 "subs %w3, %w3, #8 \n"
3262 "prfm pldl1keep, [%1, 448] \n"
3263 "urhadd v0.8h, v0.8h, v1.8h \n"
3264 "prfm pldl1keep, [%2, 448] \n"
3265 "st1 {v0.8h}, [%0], #16 \n"
3266 "b.gt 50b \n"
3267 "b 99f \n"
3268
3269 // Blend 100 / 0 - Copy row unchanged.
3270 "100: \n"
3271 "ld1 {v0.8h}, [%1], #16 \n"
3272 "subs %w3, %w3, #8 \n"
3273 "prfm pldl1keep, [%1, 448] \n"
3274 "st1 {v0.8h}, [%0], #16 \n"
3275 "b.gt 100b \n"
3276
3277 "99: \n"
3278 : "+r"(dst_ptr), // %0
3279 "+r"(src_ptr), // %1
3280 "+r"(src_ptr1), // %2
3281 "+r"(dst_width) // %3
3282 : "r"(y1_fraction), // %4
3283 "r"(y0_fraction) // %5
3284 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
3285 }
3286
3287 // Bilinear filter 8x2 -> 8x1
3288 // Use scale to convert lsb formats to msb, depending on how many bits there are:
3289 // 32768 = 9 bits
3290 // 16384 = 10 bits
3291 // 4096 = 12 bits
3292 // 256 = 16 bits
3293 void InterpolateRow_16To8_NEON(uint8_t* dst_ptr,
3294 const uint16_t* src_ptr,
3295 ptrdiff_t src_stride,
3296 int scale,
3297 int dst_width,
3298 int source_y_fraction) {
3299 int y1_fraction = source_y_fraction;
3300 int y0_fraction = 256 - y1_fraction;
3301 const uint16_t* src_ptr1 = src_ptr + src_stride;
3302 int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
3303
3304 asm volatile(
3305 "dup v6.8h, %w6 \n"
3306 "cmp %w4, #0 \n"
3307 "b.eq 100f \n"
3308 "cmp %w4, #128 \n"
3309 "b.eq 50f \n"
3310
3311 "dup v5.8h, %w4 \n"
3312 "dup v4.8h, %w5 \n"
3313 // General purpose row blend.
3314 "1: \n"
3315 "ld1 {v0.8h}, [%1], #16 \n"
3316 "ld1 {v1.8h}, [%2], #16 \n"
3317 "subs %w3, %w3, #8 \n"
3318 "umull v2.4s, v0.4h, v4.4h \n"
3319 "prfm pldl1keep, [%1, 448] \n"
3320 "umull2 v3.4s, v0.8h, v4.8h \n"
3321 "prfm pldl1keep, [%2, 448] \n"
3322 "umlal v2.4s, v1.4h, v5.4h \n"
3323 "umlal2 v3.4s, v1.8h, v5.8h \n"
3324 "rshrn v0.4h, v2.4s, #8 \n"
3325 "rshrn2 v0.8h, v3.4s, #8 \n"
3326 "ushl v0.8h, v0.8h, v6.8h \n"
3327 "uqxtn v0.8b, v0.8h \n"
3328 "st1 {v0.8b}, [%0], #8 \n"
3329 "b.gt 1b \n"
3330 "b 99f \n"
3331
3332 // Blend 50 / 50.
3333 "50: \n"
3334 "ld1 {v0.8h}, [%1], #16 \n"
3335 "ld1 {v1.8h}, [%2], #16 \n"
3336 "subs %w3, %w3, #8 \n"
3337 "prfm pldl1keep, [%1, 448] \n"
3338 "urhadd v0.8h, v0.8h, v1.8h \n"
3339 "prfm pldl1keep, [%2, 448] \n"
3340 "ushl v0.8h, v0.8h, v6.8h \n"
3341 "uqxtn v0.8b, v0.8h \n"
3342 "st1 {v0.8b}, [%0], #8 \n"
3343 "b.gt 50b \n"
3344 "b 99f \n"
3345
3346 // Blend 100 / 0 - Copy row unchanged.
3347 "100: \n"
3348 "ldr q0, [%1], #16 \n"
3349 "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
3350 "prfm pldl1keep, [%1, 448] \n"
3351 "uqxtn v0.8b, v0.8h \n"
3352 "subs %w3, %w3, #8 \n" // 8 src pixels per loop
3353 "str d0, [%0], #8 \n" // store 8 pixels
3354 "b.gt 100b \n"
3355
3356 "99: \n"
3357 : "+r"(dst_ptr), // %0
3358 "+r"(src_ptr), // %1
3359 "+r"(src_ptr1), // %2
3360 "+r"(dst_width) // %3
3361 : "r"(y1_fraction), // %4
3362 "r"(y0_fraction), // %5
3363 "r"(shift) // %6
3364 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
3365 }
3366
3367 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
3368 void ARGBBlendRow_NEON(const uint8_t* src_argb,
3369 const uint8_t* src_argb1,
3370 uint8_t* dst_argb,
3371 int width) {
3372 asm volatile(
3373 "subs %w3, %w3, #8 \n"
3374 "b.lt 89f \n"
3375 // Blend 8 pixels.
3376 "8: \n"
3377 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
3378 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
3379 "subs %w3, %w3, #8 \n" // 8 processed per loop.
3380 "umull v16.8h, v4.8b, v3.8b \n" // db * a
3381 "prfm pldl1keep, [%0, 448] \n"
3382 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
3383 "prfm pldl1keep, [%1, 448] \n"
3384 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
3385 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
3386 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
3387 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
3388 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
3389 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
3390 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
3391 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
3392 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
3393 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
3394 "movi v3.8b, #255 \n" // a = 255
3395 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
3396 // pixels
3397 "b.ge 8b \n"
3398
3399 "89: \n"
3400 "adds %w3, %w3, #8-1 \n"
3401 "b.lt 99f \n"
3402
3403 // Blend 1 pixel.
3404 "1: \n"
3405 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel
3406 // ARGB0.
3407 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel
3408 // ARGB1.
3409 "subs %w3, %w3, #1 \n" // 1 processed per loop.
3410 "umull v16.8h, v4.8b, v3.8b \n" // db * a
3411 "prfm pldl1keep, [%0, 448] \n"
3412 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
3413 "prfm pldl1keep, [%1, 448] \n"
3414 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
3415 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
3416 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
3417 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
3418 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
3419 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
3420 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
3421 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
3422 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
3423 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
3424 "movi v3.8b, #255 \n" // a = 255
3425 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
3426 "b.ge 1b \n"
3427
3428 "99: \n"
3429
3430 : "+r"(src_argb), // %0
3431 "+r"(src_argb1), // %1
3432 "+r"(dst_argb), // %2
3433 "+r"(width) // %3
3434 :
3435 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3436 "v17", "v18");
3437 }
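// Illustrative scalar sketch (hypothetical helper) of one color channel of
// the blend above: attenuate the destination by the source alpha with
// rounding (uqrshrn #8), then add the source channel with saturation
// (uqsub/uqadd).
static inline uint8_t BlendChannelSketch(uint8_t s, uint8_t d, uint8_t sa) {
  uint32_t da = ((uint32_t)d * sa + 128) >> 8;  // d * a / 256, rounded
  uint32_t keep = d > da ? d - da : 0;          // uqsub saturates at 0
  uint32_t out = s + keep;                      // uqadd saturates at 255
  return out > 255 ? 255 : (uint8_t)out;
}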
3438
3439 // Attenuate 8 pixels at a time.
3440 void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
3441 uint8_t* dst_argb,
3442 int width) {
3443 asm volatile(
3444 "movi v7.8h, #0x00ff \n" // 255 for rounding up
3445
3446 // Attenuate 8 pixels.
3447 "1: \n"
3448 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
3449 "subs %w2, %w2, #8 \n" // 8 processed per loop.
3450 "umull v4.8h, v0.8b, v3.8b \n" // b * a
3451 "prfm pldl1keep, [%0, 448] \n"
3452 "umull v5.8h, v1.8b, v3.8b \n" // g * a
3453 "umull v6.8h, v2.8b, v3.8b \n" // r * a
3454 "addhn v0.8b, v4.8h, v7.8h \n" // (b + 255) >> 8
3455 "addhn v1.8b, v5.8h, v7.8h \n" // (g + 255) >> 8
3456 "addhn v2.8b, v6.8h, v7.8h \n" // (r + 255) >> 8
3457 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
3458 "b.gt 1b \n"
3459 : "+r"(src_argb), // %0
3460 "+r"(dst_argb), // %1
3461 "+r"(width) // %2
3462 :
3463 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3464 }
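// Illustrative scalar sketch (hypothetical helper) of ARGBAttenuateRow_NEON's
// per-channel math: addhn against 0x00ff keeps the high byte of c*a + 255,
// i.e. a multiply by alpha that rounds up.
static inline uint8_t AttenuateSketch(uint8_t c, uint8_t a) {
  return (uint8_t)(((uint32_t)c * a + 255) >> 8);
}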
3465
3466 // Quantize 8 ARGB pixels (32 bytes).
3467 // dst = (dst * scale >> 16) * interval_size + interval_offset;
3468 void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
3469 int scale,
3470 int interval_size,
3471 int interval_offset,
3472 int width) {
3473 asm volatile(
3474 "dup v4.8h, %w2 \n"
3475 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
3476 "dup v5.8h, %w3 \n" // interval multiply.
3477 "dup v6.8h, %w4 \n" // interval add
3478
3479 // 8 pixel loop.
3480 "1: \n"
3481 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
3482 "subs %w1, %w1, #8 \n" // 8 processed per loop.
3483 "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
3484 "prfm pldl1keep, [%0, 448] \n"
3485 "uxtl v1.8h, v1.8b \n"
3486 "uxtl v2.8h, v2.8b \n"
3487 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
3488 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
3489 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
3490 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
3491 "mul v1.8h, v1.8h, v5.8h \n" // g
3492 "mul v2.8h, v2.8h, v5.8h \n" // r
3493 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
3494 "add v1.8h, v1.8h, v6.8h \n" // g
3495 "add v2.8h, v2.8h, v6.8h \n" // r
3496 "uqxtn v0.8b, v0.8h \n"
3497 "uqxtn v1.8b, v1.8h \n"
3498 "uqxtn v2.8b, v2.8h \n"
3499 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
3500 "b.gt 1b \n"
3501 : "+r"(dst_argb), // %0
3502 "+r"(width) // %1
3503 : "r"(scale), // %2
3504 "r"(interval_size), // %3
3505 "r"(interval_offset) // %4
3506 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
3507 }
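// Illustrative scalar sketch (hypothetical helper) of the quantize formula in
// the comment above: the NEON code halves 'scale' so that sqdmulh's implicit
// doubling yields (c * scale) >> 16 before applying interval size and offset.
// Assumes non-negative scale and offsets, as used here.
static inline uint8_t QuantizeSketch(uint8_t c, int scale, int interval_size,
                                     int interval_offset) {
  int v = ((c * scale) >> 16) * interval_size + interval_offset;
  return v > 255 ? 255 : (uint8_t)v;  // uqxtn saturates to 8 bits
}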
3508
3509 // Shade 8 pixels at a time by specified value.
3510 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
3511 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
3512 void ARGBShadeRow_NEON(const uint8_t* src_argb,
3513 uint8_t* dst_argb,
3514 int width,
3515 uint32_t value) {
3516 asm volatile(
3517 "dup v0.4s, %w3 \n" // duplicate scale value.
3518 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
3519 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
3520
3521 // 8 pixel loop.
3522 "1: \n"
3523 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
3524 "subs %w2, %w2, #8 \n" // 8 processed per loop.
3525 "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
3526 "prfm pldl1keep, [%0, 448] \n"
3527 "uxtl v5.8h, v5.8b \n"
3528 "uxtl v6.8h, v6.8b \n"
3529 "uxtl v7.8h, v7.8b \n"
3530 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
3531 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
3532 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
3533 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
3534 "uqxtn v4.8b, v4.8h \n"
3535 "uqxtn v5.8b, v5.8h \n"
3536 "uqxtn v6.8b, v6.8h \n"
3537 "uqxtn v7.8b, v7.8h \n"
3538 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
3539 "b.gt 1b \n"
3540 : "+r"(src_argb), // %0
3541 "+r"(dst_argb), // %1
3542 "+r"(width) // %2
3543 : "r"(value) // %3
3544 : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
3545 }
3546
3547 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
3548 // Similar to ARGBToYJ but stores ARGB.
3549 // C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
3550 void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
3551 asm volatile(
3552 "movi v24.8b, #29 \n" // B * 0.1140 coefficient
3553 "movi v25.8b, #150 \n" // G * 0.5870 coefficient
3554 "movi v26.8b, #77 \n" // R * 0.2990 coefficient
3555 "1: \n"
3556 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
3557 "subs %w2, %w2, #8 \n" // 8 processed per loop.
3558 "umull v4.8h, v0.8b, v24.8b \n" // B
3559 "prfm pldl1keep, [%0, 448] \n"
3560 "umlal v4.8h, v1.8b, v25.8b \n" // G
3561 "umlal v4.8h, v2.8b, v26.8b \n" // R
3562 "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B
3563 "orr v1.8b, v0.8b, v0.8b \n" // G
3564 "orr v2.8b, v0.8b, v0.8b \n" // R
3565 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
3566 "b.gt 1b \n"
3567 : "+r"(src_argb), // %0
3568 "+r"(dst_argb), // %1
3569 "+r"(width) // %2
3570 :
3571 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
3572 }
3573
3574 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3575 // b = (r * 35 + g * 68 + b * 17) >> 7
3576 // g = (r * 45 + g * 88 + b * 22) >> 7
3577 // r = (r * 50 + g * 98 + b * 24) >> 7
3578
3579 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
3580 asm volatile(
3581 "movi v20.8b, #17 \n" // BB coefficient
3582 "movi v21.8b, #68 \n" // BG coefficient
3583 "movi v22.8b, #35 \n" // BR coefficient
3584 "movi v24.8b, #22 \n" // GB coefficient
3585 "movi v25.8b, #88 \n" // GG coefficient
3586 "movi v26.8b, #45 \n" // GR coefficient
3587 "movi v28.8b, #24 \n" // BB coefficient
3588 "movi v29.8b, #98 \n" // BG coefficient
3589 "movi v30.8b, #50 \n" // BR coefficient
3590 "1: \n"
3591 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
3592 "subs %w1, %w1, #8 \n" // 8 processed per loop.
3593 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
3594 "prfm pldl1keep, [%0, 448] \n"
3595 "umlal v4.8h, v1.8b, v21.8b \n" // G
3596 "umlal v4.8h, v2.8b, v22.8b \n" // R
3597 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
3598 "umlal v5.8h, v1.8b, v25.8b \n" // G
3599 "umlal v5.8h, v2.8b, v26.8b \n" // R
3600 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
3601 "umlal v6.8h, v1.8b, v29.8b \n" // G
3602 "umlal v6.8h, v2.8b, v30.8b \n" // R
3603 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
3604 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
3605 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
3606 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
3607 "b.gt 1b \n"
3608 : "+r"(dst_argb), // %0
3609 "+r"(width) // %1
3610 :
3611 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
3612 "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
3613 }
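// Illustrative scalar sketch (hypothetical helper) of one sepia output
// channel, matching the umull/umlal/uqshrn #7 sequence above, e.g. for the
// new R channel kb=24, kg=98, kr=50.
static inline uint8_t SepiaChannelSketch(uint8_t b, uint8_t g, uint8_t r,
                                         int kb, int kg, int kr) {
  uint32_t v = (b * kb + g * kg + r * kr) >> 7;
  return v > 255 ? 255 : (uint8_t)v;  // uqshrn saturates
}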
3614
3615 // Transform 8 ARGB pixels (32 bytes) with color matrix.
3616 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
3617 // needs to saturate. Consider doing a non-saturating version.
3618 void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
3619 uint8_t* dst_argb,
3620 const int8_t* matrix_argb,
3621 int width) {
3622 asm volatile(
3623 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
3624 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
3625 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
3626
3627 "1: \n"
3628 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
3629 "subs %w2, %w2, #8 \n" // 8 processed per loop.
3630 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
3631 "prfm pldl1keep, [%0, 448] \n"
3632 "uxtl v17.8h, v17.8b \n" // g
3633 "uxtl v18.8h, v18.8b \n" // r
3634 "uxtl v19.8h, v19.8b \n" // a
3635 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
3636 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
3637 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
3638 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
3639 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
3640 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
3641 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
3642 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
3643 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
3644 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
3645 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
3646 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
3647 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
3648 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
3649 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
3650 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
3651 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
3652 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
3653 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
3654 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
3655 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
3656 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
3657 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
3658 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
3659 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
3660 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
3661 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
3662 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
3663 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
3664 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
3665 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
3666 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
3667 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
3668 "b.gt 1b \n"
3669 : "+r"(src_argb), // %0
3670 "+r"(dst_argb), // %1
3671 "+r"(width) // %2
3672 : "r"(matrix_argb) // %3
3673 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
3674 "v17", "v18", "v19", "v22", "v23", "v24", "v25");
3675 }
3676
3677 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
3678 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
3679 void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
3680 const uint8_t* src_argb1,
3681 uint8_t* dst_argb,
3682 int width) {
3683 asm volatile(
3684 // 8 pixel loop.
3685 "1: \n"
3686 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
3687 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
3688 "subs %w3, %w3, #8 \n" // 8 processed per loop.
3689 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
3690 "prfm pldl1keep, [%0, 448] \n"
3691 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
3692 "prfm pldl1keep, [%1, 448] \n"
3693 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
3694 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
3695 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
3696 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
3697 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
3698 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
3699 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
3700 "b.gt 1b \n"
3701 : "+r"(src_argb), // %0
3702 "+r"(src_argb1), // %1
3703 "+r"(dst_argb), // %2
3704 "+r"(width) // %3
3705 :
3706 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3707 }
3708
3709 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
3710 void ARGBAddRow_NEON(const uint8_t* src_argb,
3711 const uint8_t* src_argb1,
3712 uint8_t* dst_argb,
3713 int width) {
3714 asm volatile(
3715 // 8 pixel loop.
3716 "1: \n"
3717 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
3718 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
3719 "subs %w3, %w3, #8 \n" // 8 processed per loop.
3720 "uqadd v0.8b, v0.8b, v4.8b \n"
3721 "prfm pldl1keep, [%0, 448] \n"
3722 "uqadd v1.8b, v1.8b, v5.8b \n"
3723 "prfm pldl1keep, [%1, 448] \n"
3724 "uqadd v2.8b, v2.8b, v6.8b \n"
3725 "uqadd v3.8b, v3.8b, v7.8b \n"
3726 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
3727 "b.gt 1b \n"
3728 : "+r"(src_argb), // %0
3729 "+r"(src_argb1), // %1
3730 "+r"(dst_argb), // %2
3731 "+r"(width) // %3
3732 :
3733 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3734 }
3735
3736 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
3737 void ARGBSubtractRow_NEON(const uint8_t* src_argb,
3738 const uint8_t* src_argb1,
3739 uint8_t* dst_argb,
3740 int width) {
3741 asm volatile(
3742 // 8 pixel loop.
3743 "1: \n"
3744 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
3745 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
3746 "subs %w3, %w3, #8 \n" // 8 processed per loop.
3747 "uqsub v0.8b, v0.8b, v4.8b \n"
3748 "prfm pldl1keep, [%0, 448] \n"
3749 "uqsub v1.8b, v1.8b, v5.8b \n"
3750 "prfm pldl1keep, [%1, 448] \n"
3751 "uqsub v2.8b, v2.8b, v6.8b \n"
3752 "uqsub v3.8b, v3.8b, v7.8b \n"
3753 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
3754 "b.gt 1b \n"
3755 : "+r"(src_argb), // %0
3756 "+r"(src_argb1), // %1
3757 "+r"(dst_argb), // %2
3758 "+r"(width) // %3
3759 :
3760 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
3761 }
3762
3763 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
3764 // A = 255
3765 // R = Sobel
3766 // G = Sobel
3767 // B = Sobel
3768 void SobelRow_NEON(const uint8_t* src_sobelx,
3769 const uint8_t* src_sobely,
3770 uint8_t* dst_argb,
3771 int width) {
3772 asm volatile(
3773 "movi v3.8b, #255 \n" // alpha
3774 // 8 pixel loop.
3775 "1: \n"
3776 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
3777 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
3778 "subs %w3, %w3, #8 \n" // 8 processed per loop.
3779 "uqadd v0.8b, v0.8b, v1.8b \n" // add
3780 "prfm pldl1keep, [%0, 448] \n"
3781 "orr v1.8b, v0.8b, v0.8b \n"
3782 "prfm pldl1keep, [%1, 448] \n"
3783 "orr v2.8b, v0.8b, v0.8b \n"
3784 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
3785 "b.gt 1b \n"
3786 : "+r"(src_sobelx), // %0
3787 "+r"(src_sobely), // %1
3788 "+r"(dst_argb), // %2
3789 "+r"(width) // %3
3790 :
3791 : "cc", "memory", "v0", "v1", "v2", "v3");
3792 }
3793
3794 // Adds Sobel X and Sobel Y and stores Sobel into plane.
3795 void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
3796 const uint8_t* src_sobely,
3797 uint8_t* dst_y,
3798 int width) {
3799 asm volatile(
3800 // 16 pixel loop.
3801 "1: \n"
3802 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
3803 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
3804 "subs %w3, %w3, #16 \n" // 16 processed per loop.
3805 "prfm pldl1keep, [%0, 448] \n"
3806 "uqadd v0.16b, v0.16b, v1.16b \n" // add
3807 "prfm pldl1keep, [%1, 448] \n"
3808 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
3809 "b.gt 1b \n"
3810 : "+r"(src_sobelx), // %0
3811 "+r"(src_sobely), // %1
3812 "+r"(dst_y), // %2
3813 "+r"(width) // %3
3814 :
3815 : "cc", "memory", "v0", "v1");
3816 }
3817
3818 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
3819 // A = 255
3820 // R = Sobel X
3821 // G = Sobel
3822 // B = Sobel Y
3823 void SobelXYRow_NEON(const uint8_t* src_sobelx,
3824 const uint8_t* src_sobely,
3825 uint8_t* dst_argb,
3826 int width) {
3827 asm volatile(
3828 "movi v3.8b, #255 \n" // alpha
3829 // 8 pixel loop.
3830 "1: \n"
3831 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
3832 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
3833 "subs %w3, %w3, #8 \n" // 8 processed per loop.
3834 "prfm pldl1keep, [%0, 448] \n"
3835 "uqadd v1.8b, v0.8b, v2.8b \n" // add
3836 "prfm pldl1keep, [%1, 448] \n"
3837 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
3838 "b.gt 1b \n"
3839 : "+r"(src_sobelx), // %0
3840 "+r"(src_sobely), // %1
3841 "+r"(dst_argb), // %2
3842 "+r"(width) // %3
3843 :
3844 : "cc", "memory", "v0", "v1", "v2", "v3");
3845 }
3846
3847 // SobelX as a matrix is
3848 // -1 0 1
3849 // -2 0 2
3850 // -1 0 1
3851 void SobelXRow_NEON(const uint8_t* src_y0,
3852 const uint8_t* src_y1,
3853 const uint8_t* src_y2,
3854 uint8_t* dst_sobelx,
3855 int width) {
3856 asm volatile(
3857 "1: \n"
3858 "ld1 {v0.8b}, [%0],%5 \n" // top
3859 "ld1 {v1.8b}, [%0],%6 \n"
3860 "usubl v0.8h, v0.8b, v1.8b \n"
3861 "prfm pldl1keep, [%0, 448] \n"
3862 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
3863 "ld1 {v3.8b}, [%1],%6 \n"
3864 "usubl v1.8h, v2.8b, v3.8b \n"
3865 "prfm pldl1keep, [%1, 448] \n"
3866 "add v0.8h, v0.8h, v1.8h \n"
3867 "add v0.8h, v0.8h, v1.8h \n"
3868 "ld1 {v2.8b}, [%2],%5 \n" // bottom
3869 "ld1 {v3.8b}, [%2],%6 \n"
3870 "subs %w4, %w4, #8 \n" // 8 pixels
3871 "prfm pldl1keep, [%2, 448] \n"
3872 "usubl v1.8h, v2.8b, v3.8b \n"
3873 "add v0.8h, v0.8h, v1.8h \n"
3874 "abs v0.8h, v0.8h \n"
3875 "uqxtn v0.8b, v0.8h \n"
3876 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
3877 "b.gt 1b \n"
3878 : "+r"(src_y0), // %0
3879 "+r"(src_y1), // %1
3880 "+r"(src_y2), // %2
3881 "+r"(dst_sobelx), // %3
3882 "+r"(width) // %4
3883 : "r"(2LL), // %5
3884 "r"(6LL) // %6
3885 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3886 );
3887 }
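// Illustrative scalar sketch (hypothetical helper) of one SobelX output as
// computed above: horizontal differences at offsets 0 and 2 across the three
// rows, with the centre row weighted twice, absolute value, saturated.
static inline uint8_t SobelXSketch(const uint8_t* y0, const uint8_t* y1,
                                   const uint8_t* y2, int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;                  // abs
  return s > 255 ? 255 : (uint8_t)s;  // uqxtn saturates
}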
3888
3889 // SobelY as a matrix is
3890 // -1 -2 -1
3891 // 0 0 0
3892 // 1 2 1
3893 void SobelYRow_NEON(const uint8_t* src_y0,
3894 const uint8_t* src_y1,
3895 uint8_t* dst_sobely,
3896 int width) {
3897 asm volatile(
3898 "1: \n"
3899 "ld1 {v0.8b}, [%0],%4 \n" // left
3900 "ld1 {v1.8b}, [%1],%4 \n"
3901 "usubl v0.8h, v0.8b, v1.8b \n"
3902 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
3903 "ld1 {v3.8b}, [%1],%4 \n"
3904 "usubl v1.8h, v2.8b, v3.8b \n"
3905 "add v0.8h, v0.8h, v1.8h \n"
3906 "add v0.8h, v0.8h, v1.8h \n"
3907 "ld1 {v2.8b}, [%0],%5 \n" // right
3908 "ld1 {v3.8b}, [%1],%5 \n"
3909 "subs %w3, %w3, #8 \n" // 8 pixels
3910 "usubl v1.8h, v2.8b, v3.8b \n"
3911 "prfm pldl1keep, [%0, 448] \n"
3912 "add v0.8h, v0.8h, v1.8h \n"
3913 "prfm pldl1keep, [%1, 448] \n"
3914 "abs v0.8h, v0.8h \n"
3915 "uqxtn v0.8b, v0.8h \n"
3916 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
3917 "b.gt 1b \n"
3918 : "+r"(src_y0), // %0
3919 "+r"(src_y1), // %1
3920 "+r"(dst_sobely), // %2
3921 "+r"(width) // %3
3922 : "r"(1LL), // %4
3923 "r"(6LL) // %5
3924 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
3925 );
3926 }
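// Illustrative scalar sketch (hypothetical helper) of one SobelY output:
// vertical differences between the row above (r0) and the row below (r1) at
// offsets 0, 1, 2, with the middle column weighted twice.
static inline uint8_t SobelYSketch(const uint8_t* r0, const uint8_t* r1,
                                   int i) {
  int s = (r0[i] - r1[i]) + 2 * (r0[i + 1] - r1[i + 1]) +
          (r0[i + 2] - r1[i + 2]);
  if (s < 0) s = -s;
  return s > 255 ? 255 : (uint8_t)s;
}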
3927
3928 // Caveat - rounds float to half float whereas scaling version truncates.
3929 void HalfFloat1Row_NEON(const uint16_t* src,
3930 uint16_t* dst,
3931 float /*unused*/,
3932 int width) {
3933 asm volatile(
3934 "1: \n"
3935 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
3936 "subs %w2, %w2, #8 \n" // 8 pixels per loop
3937 "uxtl v2.4s, v1.4h \n" // 8 int's
3938 "prfm pldl1keep, [%0, 448] \n"
3939 "uxtl2 v3.4s, v1.8h \n"
3940 "scvtf v2.4s, v2.4s \n" // 8 floats
3941 "scvtf v3.4s, v3.4s \n"
3942 "fcvtn v1.4h, v2.4s \n" // 8 half floats
3943 "fcvtn2 v1.8h, v3.4s \n"
3944 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
3945 "b.gt 1b \n"
3946 : "+r"(src), // %0
3947 "+r"(dst), // %1
3948 "+r"(width) // %2
3949 :
3950 : "cc", "memory", "v1", "v2", "v3");
3951 }
3952
3953 void HalfFloatRow_NEON(const uint16_t* src,
3954 uint16_t* dst,
3955 float scale,
3956 int width) {
3957 asm volatile(
3958 "1: \n"
3959 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
3960 "subs %w2, %w2, #8 \n" // 8 pixels per loop
3961 "uxtl v2.4s, v1.4h \n" // 8 int's
3962 "prfm pldl1keep, [%0, 448] \n"
3963 "uxtl2 v3.4s, v1.8h \n"
3964 "scvtf v2.4s, v2.4s \n" // 8 floats
3965 "scvtf v3.4s, v3.4s \n"
3966 "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
3967 "fmul v3.4s, v3.4s, %3.s[0] \n"
3968 "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
3969 "uqshrn2 v1.8h, v3.4s, #13 \n"
3970 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
3971 "b.gt 1b \n"
3972 : "+r"(src), // %0
3973 "+r"(dst), // %1
3974 "+r"(width) // %2
3975 : "w"(scale * 1.9259299444e-34f) // %3
3976 : "cc", "memory", "v1", "v2", "v3");
3977 }
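// Illustrative sketch (hypothetical helper) of the half-float trick above:
// multiplying by scale * 2^-112 rebiases the binary32 exponent so that the
// float's bit pattern shifted right by 13 is the binary16 encoding; uqshrn
// does that shift with saturation. Assumes non-negative inputs, as here.
static inline uint16_t HalfFloatSketch(uint16_t v, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t h = bits.u >> 13;
  return h > 0xffff ? 0xffff : (uint16_t)h;
}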
3978
3979 void ByteToFloatRow_NEON(const uint8_t* src,
3980 float* dst,
3981 float scale,
3982 int width) {
3983 asm volatile(
3984 "1: \n"
3985 "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
3986 "subs %w2, %w2, #8 \n" // 8 pixels per loop
3987 "uxtl v1.8h, v1.8b \n" // 8 shorts
3988 "prfm pldl1keep, [%0, 448] \n"
3989 "uxtl v2.4s, v1.4h \n" // 8 ints
3990 "uxtl2 v3.4s, v1.8h \n"
3991 "scvtf v2.4s, v2.4s \n" // 8 floats
3992 "scvtf v3.4s, v3.4s \n"
3993 "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
3994 "fmul v3.4s, v3.4s, %3.s[0] \n"
3995 "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
3996 "b.gt 1b \n"
3997 : "+r"(src), // %0
3998 "+r"(dst), // %1
3999 "+r"(width) // %2
4000 : "w"(scale) // %3
4001 : "cc", "memory", "v1", "v2", "v3");
4002 }
4003
4004 // Convert FP16 Half Floats to FP32 Floats
4005 void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
4006 float* dst,
4007 int width) {
4008 asm volatile(
4009 "1: \n"
4010 "ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats
4011 "subs %w2, %w2, #8 \n" // 8 floats per loop
4012 "prfm pldl1keep, [%0, 448] \n"
4013 "fcvtl v2.4s, v1.4h \n" // 8 floats
4014 "fcvtl2 v3.4s, v1.8h \n"
4015 "stp q2, q3, [%1], #32 \n" // store 8 floats
4016 "b.gt 1b \n"
4017 : "+r"(src), // %0
4018 "+r"(dst), // %1
4019 "+r"(width) // %2
4020 :
4021 : "cc", "memory", "v1", "v2", "v3");
4022 }
4023
4024 // Convert FP16 Half Floats to FP32 Floats
4025 // Read a column and write a row
4026 void ConvertFP16ToFP32Column_NEON(const uint16_t* src, // fp16
4027 int src_stride, // stride in elements
4028 float* dst,
4029 int width) {
4030 asm volatile(
4031 "cmp %w2, #8 \n" // Is there 8 rows?
4032 "b.lo 2f \n"
4033 "1: \n"
4034 "ld1 {v0.h}[0], [%0], %3 \n" // load 8 halffloats
4035 "ld1 {v0.h}[1], [%0], %3 \n"
4036 "ld1 {v0.h}[2], [%0], %3 \n"
4037 "ld1 {v0.h}[3], [%0], %3 \n"
4038 "ld1 {v1.h}[0], [%0], %3 \n"
4039 "ld1 {v1.h}[1], [%0], %3 \n"
4040 "ld1 {v1.h}[2], [%0], %3 \n"
4041 "ld1 {v1.h}[3], [%0], %3 \n"
4042 "subs %w2, %w2, #8 \n" // 8 rows per loop
4043 "prfm pldl1keep, [%0, 448] \n"
4044 "fcvtl v2.4s, v0.4h \n" // 4 floats
4045 "fcvtl v3.4s, v1.4h \n" // 4 more floats
4046 "stp q2, q3, [%1], #32 \n" // store 8 floats
4047 "b.gt 1b \n"
4048 "cmp %w2, #1 \n" // Is there 1 value?
4049 "b.lo 3f \n"
4050 "2: \n"
4051 "ld1 {v1.h}[0], [%0], %3 \n" // load 1 halffloats
4052 "subs %w2, %w2, #1 \n" // 1 floats per loop
4053 "fcvtl v2.4s, v1.4h \n" // 1 floats
4054 "str s2, [%1], #4 \n" // store 1 floats
4055 "b.gt 2b \n"
4056 "3: \n"
4057 : "+r"(src), // %0
4058 "+r"(dst), // %1
4059 "+r"(width) // %2
4060 : "r"((ptrdiff_t)(src_stride * 2)) // %3
4061 : "cc", "memory", "v0", "v1", "v2", "v3");
4062 }
4063
4064 // Convert FP32 Floats to FP16 Half Floats
4065 void ConvertFP32ToFP16Row_NEON(const float* src,
4066 uint16_t* dst, // fp16
4067 int width) {
4068 asm volatile(
4069 "1: \n"
4070 "ldp q2, q3, [%0], #32 \n" // load 8 floats
4071 "subs %w2, %w2, #8 \n" // 8 floats per loop
4072 "prfm pldl1keep, [%0, 448] \n"
4073 "fcvtn v1.4h, v2.4s \n" // 8 fp16 halffloats
4074 "fcvtn2 v1.8h, v3.4s \n"
4075 "str q1, [%1], #16 \n" // store 8 fp16 halffloats
4076 "b.gt 1b \n"
4077 : "+r"(src), // %0
4078 "+r"(dst), // %1
4079 "+r"(width) // %2
4080 :
4081 : "cc", "memory", "v1", "v2", "v3");
4082 }
4083
4084 float ScaleMaxSamples_NEON(const float* src,
4085 float* dst,
4086 float scale,
4087 int width) {
4088 float fmax;
4089 asm volatile(
4090 "movi v5.4s, #0 \n" // max
4091 "movi v6.4s, #0 \n"
4092
4093 "1: \n"
4094 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
4095 "subs %w2, %w2, #8 \n" // 8 processed per loop
4096 "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
4097 "prfm pldl1keep, [%0, 448] \n"
4098 "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
4099 "fmax v5.4s, v5.4s, v1.4s \n" // max
4100 "fmax v6.4s, v6.4s, v2.4s \n"
4101 "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
4102 "b.gt 1b \n"
4103 "fmax v5.4s, v5.4s, v6.4s \n" // max
4104 "fmaxv %s3, v5.4s \n" // signed max acculator
4105 : "+r"(src), // %0
4106 "+r"(dst), // %1
4107 "+r"(width), // %2
4108 "=w"(fmax) // %3
4109 : "w"(scale) // %4
4110 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
4111 return fmax;
4112 }
4113
4114 float ScaleSumSamples_NEON(const float* src,
4115 float* dst,
4116 float scale,
4117 int width) {
4118 float fsum;
4119 asm volatile(
4120 "movi v5.4s, #0 \n" // max
4121 "movi v6.4s, #0 \n" // max
4122
4123 "1: \n"
4124 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
4125 "subs %w2, %w2, #8 \n" // 8 processed per loop
4126 "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
4127 "prfm pldl1keep, [%0, 448] \n"
4128 "fmul v4.4s, v2.4s, %4.s[0] \n"
4129 "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
4130 "fmla v6.4s, v2.4s, v2.4s \n"
4131 "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
4132 "b.gt 1b \n"
4133 "faddp v5.4s, v5.4s, v6.4s \n"
4134 "faddp v5.4s, v5.4s, v5.4s \n"
4135 "faddp %3.4s, v5.4s, v5.4s \n" // sum
4136 : "+r"(src), // %0
4137 "+r"(dst), // %1
4138 "+r"(width), // %2
4139 "=w"(fsum) // %3
4140 : "w"(scale) // %4
4141 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
4142 return fsum;
4143 }
4144
4145 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
4146 asm volatile(
4147 "1: \n"
4148 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
4149 "prfm pldl1keep, [%0, 448] \n"
4150 "subs %w2, %w2, #8 \n" // 8 processed per loop
4151 "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
4152 "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
4153 "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
4154 "b.gt 1b \n"
4155 : "+r"(src), // %0
4156 "+r"(dst), // %1
4157 "+r"(width) // %2
4158 : "w"(scale) // %3
4159 : "cc", "memory", "v1", "v2");
4160 }
4161
4162 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
4163 void GaussCol_NEON(const uint16_t* src0,
4164 const uint16_t* src1,
4165 const uint16_t* src2,
4166 const uint16_t* src3,
4167 const uint16_t* src4,
4168 uint32_t* dst,
4169 int width) {
4170 asm volatile(
4171 "movi v6.8h, #4 \n" // constant 4
4172 "movi v7.8h, #6 \n" // constant 6
4173
4174 "1: \n"
4175 "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
4176 "ld1 {v2.8h}, [%4], #16 \n"
4177 "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
4178 "prfm pldl1keep, [%0, 448] \n"
4179 "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
4180 "ld1 {v2.8h}, [%1], #16 \n"
4181 "umlal v0.4s, v2.4h, v6.4h \n" // * 4
4182 "prfm pldl1keep, [%1, 448] \n"
4183 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
4184 "ld1 {v2.8h}, [%2], #16 \n"
4185 "umlal v0.4s, v2.4h, v7.4h \n" // * 6
4186 "prfm pldl1keep, [%2, 448] \n"
4187 "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
4188 "ld1 {v2.8h}, [%3], #16 \n"
4189 "umlal v0.4s, v2.4h, v6.4h \n" // * 4
4190 "prfm pldl1keep, [%3, 448] \n"
4191 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
4192 "subs %w6, %w6, #8 \n" // 8 processed per loop
4193 "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
4194 "prfm pldl1keep, [%4, 448] \n"
4195 "b.gt 1b \n"
4196 : "+r"(src0), // %0
4197 "+r"(src1), // %1
4198 "+r"(src2), // %2
4199 "+r"(src3), // %3
4200 "+r"(src4), // %4
4201 "+r"(dst), // %5
4202 "+r"(width) // %6
4203 :
4204 : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
4205 }
4206
4207 // filter a row with 1, 4, 6, 4, 1 coefficients across 5 adjacent samples.
4208 void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
4209 const uint32_t* src1 = src + 1;
4210 const uint32_t* src2 = src + 2;
4211 const uint32_t* src3 = src + 3;
4212 asm volatile(
4213 "movi v6.4s, #4 \n" // constant 4
4214 "movi v7.4s, #6 \n" // constant 6
4215
4216 "1: \n"
4217 "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
4218 "add v0.4s, v0.4s, v1.4s \n" // * 1
4219 "add v1.4s, v1.4s, v2.4s \n" // * 1
4220 "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
4221 "mla v0.4s, v2.4s, v7.4s \n" // * 6
4222 "mla v1.4s, v3.4s, v7.4s \n" // * 6
4223 "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
4224 "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
4225 "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
4226 "add v3.4s, v3.4s, v5.4s \n"
4227 "prfm pldl1keep, [%0, 448] \n"
4228 "mla v0.4s, v2.4s, v6.4s \n" // * 4
4229 "mla v1.4s, v3.4s, v6.4s \n" // * 4
4230 "subs %w5, %w5, #8 \n" // 8 processed per loop
4231 "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
4232 "uqrshrn2 v0.8h, v1.4s, #8 \n"
4233 "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
4234 "b.gt 1b \n"
4235 : "+r"(src), // %0
4236 "+r"(src1), // %1
4237 "+r"(src2), // %2
4238 "+r"(src3), // %3
4239 "+r"(dst), // %4
4240 "+r"(width) // %5
4241 : "r"(32LL) // %6
4242 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4243 }
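// Illustrative scalar sketch (hypothetical helper) of the row pass above:
// each output is the 1,4,6,4,1 weighted sum of 5 adjacent column sums,
// rounded and scaled back by 256 (uqrshrn #8), saturated to 16 bits.
static inline uint16_t GaussRowSketch(const uint32_t* s) {
  uint32_t v = s[0] + 4 * s[1] + 6 * s[2] + 4 * s[3] + s[4];
  v = (v + 128) >> 8;
  return v > 0xffff ? 0xffff : (uint16_t)v;
}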
4244
4245 static const vecf32 kGaussCoefficients = {4.0f, 6.0f, 1.0f / 256.0f, 0.0f};
4246
4247 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
4248 void GaussCol_F32_NEON(const float* src0,
4249 const float* src1,
4250 const float* src2,
4251 const float* src3,
4252 const float* src4,
4253 float* dst,
4254 int width) {
4255 asm volatile(
4256 "ld2r {v6.4s, v7.4s}, [%7] \n" // constants 4 and 6
4257
4258 "1: \n"
4259 "ld1 {v0.4s, v1.4s}, [%0], #32 \n" // load 8 samples, 5 rows
4260 "ld1 {v2.4s, v3.4s}, [%1], #32 \n"
4261 "fmla v0.4s, v2.4s, v6.4s \n" // * 4
4262 "ld1 {v4.4s, v5.4s}, [%2], #32 \n"
4263 "fmla v1.4s, v3.4s, v6.4s \n"
4264 "prfm pldl1keep, [%0, 448] \n"
4265 "fmla v0.4s, v4.4s, v7.4s \n" // * 6
4266 "ld1 {v2.4s, v3.4s}, [%3], #32 \n"
4267 "fmla v1.4s, v5.4s, v7.4s \n"
4268 "prfm pldl1keep, [%1, 448] \n"
4269 "fmla v0.4s, v2.4s, v6.4s \n" // * 4
4270 "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
4271 "fmla v1.4s, v3.4s, v6.4s \n"
4272 "prfm pldl1keep, [%2, 448] \n"
4273 "fadd v0.4s, v0.4s, v4.4s \n" // * 1
4274 "prfm pldl1keep, [%3, 448] \n"
4275 "fadd v1.4s, v1.4s, v5.4s \n"
4276 "prfm pldl1keep, [%4, 448] \n"
4277 "subs %w6, %w6, #8 \n" // 8 processed per loop
4278 "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
4279 "b.gt 1b \n"
4280 : "+r"(src0), // %0
4281 "+r"(src1), // %1
4282 "+r"(src2), // %2
4283 "+r"(src3), // %3
4284 "+r"(src4), // %4
4285 "+r"(dst), // %5
4286 "+r"(width) // %6
4287 : "r"(&kGaussCoefficients) // %7
4288 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4289 }
4290
4291 // filter a row with 1, 4, 6, 4, 1 coefficients across 5 adjacent samples.
4292 void GaussRow_F32_NEON(const float* src, float* dst, int width) {
4293 asm volatile(
4294 "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256
4295
4296 "1: \n"
4297 "ld1 {v0.4s, v1.4s, v2.4s}, [%0], %4 \n" // load 12 samples, 5
4298 // rows
4299 "fadd v0.4s, v0.4s, v1.4s \n" // * 1
4300 "ld1 {v4.4s, v5.4s}, [%0], %5 \n"
4301 "fadd v1.4s, v1.4s, v2.4s \n"
4302 "fmla v0.4s, v4.4s, v7.4s \n" // * 6
4303 "ld1 {v2.4s, v3.4s}, [%0], %4 \n"
4304 "fmla v1.4s, v5.4s, v7.4s \n"
4305 "ld1 {v4.4s, v5.4s}, [%0], %6 \n"
4306 "fadd v2.4s, v2.4s, v4.4s \n"
4307 "fadd v3.4s, v3.4s, v5.4s \n"
4308 "fmla v0.4s, v2.4s, v6.4s \n" // * 4
4309 "fmla v1.4s, v3.4s, v6.4s \n"
4310 "prfm pldl1keep, [%0, 448] \n"
4311 "fmul v0.4s, v0.4s, v8.4s \n" // / 256
4312 "fmul v1.4s, v1.4s, v8.4s \n"
4313 "subs %w2, %w2, #8 \n" // 8 processed per loop
4314 "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples
4315 "b.gt 1b \n"
4316 : "+r"(src), // %0
4317 "+r"(dst), // %1
4318 "+r"(width) // %2
4319 : "r"(&kGaussCoefficients), // %3
4320 "r"(8LL), // %4
4321 "r"(-4LL), // %5
4322 "r"(20LL) // %6
4323 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8");
4324 }
4325
4326 #if LIBYUV_USE_ST3
4327 // Convert biplanar NV21 to packed YUV24
4328 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
4329 const uint8_t* src_vu,
4330 uint8_t* dst_yuv24,
4331 int width) {
4332 asm volatile(
4333 "1: \n"
4334 "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
4335 "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
4336 "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
4337 "prfm pldl1keep, [%0, 448] \n"
4338 "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
4339 "prfm pldl1keep, [%1, 448] \n"
4340 "subs %w3, %w3, #16 \n" // 16 pixels per loop
4341 "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
4342 "b.gt 1b \n"
4343 : "+r"(src_y), // %0
4344 "+r"(src_vu), // %1
4345 "+r"(dst_yuv24), // %2
4346 "+r"(width) // %3
4347 :
4348 : "cc", "memory", "v0", "v1", "v2");
4349 }
4350 #else
4351 static const uvec8 kYUV24Shuffle[3] = {
4352 {16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20},
4353 {21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27},
4354 {10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15}};
4355
4356 // Convert biplanar NV21 to packed YUV24
4357 // NV21 has VU in memory for chroma.
4358 // YUV24 is VUY in memory
4359 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
4360 const uint8_t* src_vu,
4361 uint8_t* dst_yuv24,
4362 int width) {
4363 asm volatile(
4364 "ld1 {v5.16b,v6.16b,v7.16b}, [%4] \n" // 3 shuffler constants
4365 "1: \n"
4366 "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values
4367 "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values
4368 "tbl v2.16b, {v0.16b,v1.16b}, v5.16b \n" // weave into YUV24
4369 "prfm pldl1keep, [%0, 448] \n"
4370 "tbl v3.16b, {v0.16b,v1.16b}, v6.16b \n"
4371 "prfm pldl1keep, [%1, 448] \n"
4372 "tbl v4.16b, {v0.16b,v1.16b}, v7.16b \n"
4373 "subs %w3, %w3, #16 \n" // 16 pixels per loop
4374 "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48 \n" // store 16 YUV pixels
4375 "b.gt 1b \n"
4376 : "+r"(src_y), // %0
4377 "+r"(src_vu), // %1
4378 "+r"(dst_yuv24), // %2
4379 "+r"(width) // %3
4380 : "r"(&kYUV24Shuffle[0]) // %4
4381 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4382 }
4383 #endif // LIBYUV_USE_ST3
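// Illustrative scalar sketch (hypothetical helper) of the per-pixel layout
// both NV21ToYUV24Row_NEON variants produce: each output pixel is the shared
// VU pair followed by its Y, i.e. VUY in memory.
static inline void NV21ToYUV24PixelSketch(const uint8_t* src_y,
                                          const uint8_t* src_vu, int i,
                                          uint8_t* dst_yuv24) {
  dst_yuv24[3 * i + 0] = src_vu[(i & ~1) + 0];  // V (shared by pixel pair)
  dst_yuv24[3 * i + 1] = src_vu[(i & ~1) + 1];  // U
  dst_yuv24[3 * i + 2] = src_y[i];              // Y
}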
4384
4385 // Note ST2 8b version is faster than zip+ST1
4386
4387 // AYUV is VUYA in memory. UV for NV12 is UV order in memory.
4388 void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
4389 int src_stride_ayuv,
4390 uint8_t* dst_uv,
4391 int width) {
4392 const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
4393 asm volatile(
4394
4395 "1: \n"
4396 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
4397 "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
4398 "prfm pldl1keep, [%0, 448] \n"
4399 "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
4400 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
4401 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
4402 "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
4403 "prfm pldl1keep, [%1, 448] \n"
4404 "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
4405 "uqrshrn v2.8b, v1.8h, #2 \n"
4406 "subs %w3, %w3, #16 \n" // 16 processed per loop.
4407 "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
4408 "b.gt 1b \n"
4409 : "+r"(src_ayuv), // %0
4410 "+r"(src_ayuv_1), // %1
4411 "+r"(dst_uv), // %2
4412 "+r"(width) // %3
4413 :
4414 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4415 }
4416
4417 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
4418 int src_stride_ayuv,
4419 uint8_t* dst_vu,
4420 int width) {
4421 const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
4422 asm volatile(
4423
4424 "1: \n"
4425 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
4426 "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
4427 "prfm pldl1keep, [%0, 448] \n"
4428 "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
4429 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
4430 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
4431 "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
4432 "prfm pldl1keep, [%1, 448] \n"
4433 "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
4434 "uqrshrn v1.8b, v1.8h, #2 \n"
4435 "subs %w3, %w3, #16 \n" // 16 processed per loop.
4436 "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
4437 "b.gt 1b \n"
4438 : "+r"(src_ayuv), // %0
4439 "+r"(src_ayuv_1), // %1
4440 "+r"(dst_vu), // %2
4441 "+r"(width) // %3
4442 :
4443 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
4444 }
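// Illustrative scalar sketch (hypothetical helper) of the 2x2 chroma
// averaging used by AYUVToUVRow_NEON and AYUVToVURow_NEON above: pairwise
// adds across a row (uaddlp), accumulate the second row (uadalp), then round
// (uqrshrn #2).
static inline uint8_t Average2x2Sketch(uint8_t c00, uint8_t c01, uint8_t c10,
                                       uint8_t c11) {
  return (uint8_t)((c00 + c01 + c10 + c11 + 2) >> 2);
}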
4445
4446 // Copy row of AYUV Y's into Y
4447 void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
4448 asm volatile(
4449 "1: \n"
4450 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
4451 "subs %w2, %w2, #16 \n" // 16 pixels per loop
4452 "prfm pldl1keep, [%0, 448] \n"
4453 "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
4454 "b.gt 1b \n"
4455 : "+r"(src_ayuv), // %0
4456 "+r"(dst_y), // %1
4457 "+r"(width) // %2
4458 :
4459 : "cc", "memory", "v0", "v1", "v2", "v3");
4460 }
4461
4462 // Shuffle table for swapping UV bytes.
4463 static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
4464 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
4465
4466 // Convert UV plane of NV12 to VU of NV21.
4467 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
4468 asm volatile(
4469 "ld1 {v2.16b}, [%3] \n" // shuffler
4470 "1: \n"
4471 "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
4472 "ld1 {v1.16b}, [%0], 16 \n"
4473 "subs %w2, %w2, #16 \n" // 16 pixels per loop
4474 "tbl v0.16b, {v0.16b}, v2.16b \n"
4475 "prfm pldl1keep, [%0, 448] \n"
4476 "tbl v1.16b, {v1.16b}, v2.16b \n"
4477 "stp q0, q1, [%1], 32 \n" // store 16 VU pixels
4478 "b.gt 1b \n"
4479 : "+r"(src_uv), // %0
4480 "+r"(dst_vu), // %1
4481 "+r"(width) // %2
4482 : "r"(&kShuffleSwapUV) // %3
4483 : "cc", "memory", "v0", "v1", "v2");
4484 }
4485
4486 void HalfMergeUVRow_NEON(const uint8_t* src_u,
4487 int src_stride_u,
4488 const uint8_t* src_v,
4489 int src_stride_v,
4490 uint8_t* dst_uv,
4491 int width) {
4492 const uint8_t* src_u_1 = src_u + src_stride_u;
4493 const uint8_t* src_v_1 = src_v + src_stride_v;
4494 asm volatile(
4495 "1: \n"
4496 "ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
4497 "ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
4498 "ld1 {v2.16b}, [%1], #16 \n"
4499 "ld1 {v3.16b}, [%3], #16 \n"
4500 "uaddlp v0.8h, v0.16b \n" // half size
4501 "prfm pldl1keep, [%0, 448] \n"
4502 "uaddlp v1.8h, v1.16b \n"
4503 "prfm pldl1keep, [%2, 448] \n"
4504 "uadalp v0.8h, v2.16b \n"
4505 "prfm pldl1keep, [%1, 448] \n"
4506 "uadalp v1.8h, v3.16b \n"
4507 "prfm pldl1keep, [%3, 448] \n"
4508 "uqrshrn v0.8b, v0.8h, #2 \n"
4509 "uqrshrn v1.8b, v1.8h, #2 \n"
4510 "subs %w5, %w5, #16 \n" // 16 src pixels per loop
4511 "st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
4512 "b.gt 1b \n"
4513 : "+r"(src_u), // %0
4514 "+r"(src_u_1), // %1
4515 "+r"(src_v), // %2
4516 "+r"(src_v_1), // %3
4517 "+r"(dst_uv), // %4
4518 "+r"(width) // %5
4519 :
4520 : "cc", "memory", "v0", "v1", "v2", "v3");
4521 }
4522
4523 void SplitUVRow_16_NEON(const uint16_t* src_uv,
4524 uint16_t* dst_u,
4525 uint16_t* dst_v,
4526 int depth,
4527 int width) {
4528 int shift = depth - 16; // Negative for right shift.
4529 asm volatile(
4530 "dup v2.8h, %w4 \n"
4531 "1: \n"
4532 "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV
4533 "subs %w3, %w3, #8 \n" // 8 src pixels per loop
4534 "ushl v0.8h, v0.8h, v2.8h \n"
4535 "prfm pldl1keep, [%0, 448] \n"
4536 "ushl v1.8h, v1.8h, v2.8h \n"
4537 "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels
4538 "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels
4539 "b.gt 1b \n"
4540 : "+r"(src_uv), // %0
4541 "+r"(dst_u), // %1
4542 "+r"(dst_v), // %2
4543 "+r"(width) // %3
4544 : "r"(shift) // %4
4545 : "cc", "memory", "v0", "v1", "v2");
4546 }
4547
4548 void MultiplyRow_16_NEON(const uint16_t* src_y,
4549 uint16_t* dst_y,
4550 int scale,
4551 int width) {
4552 asm volatile(
4553 "dup v2.8h, %w3 \n"
4554 "1: \n"
4555 "ldp q0, q1, [%0], #32 \n"
4556 "mul v0.8h, v0.8h, v2.8h \n"
4557 "prfm pldl1keep, [%0, 448] \n"
4558 "mul v1.8h, v1.8h, v2.8h \n"
4559 "stp q0, q1, [%1], #32 \n" // store 16 pixels
4560 "subs %w2, %w2, #16 \n" // 16 src pixels per loop
4561 "b.gt 1b \n"
4562 : "+r"(src_y), // %0
4563 "+r"(dst_y), // %1
4564 "+r"(width) // %2
4565 : "r"(scale) // %3
4566 : "cc", "memory", "v0", "v1", "v2");
4567 }
4568
4569 void DivideRow_16_NEON(const uint16_t* src_y,
4570 uint16_t* dst_y,
4571 int scale,
4572 int width) {
4573 asm volatile(
4574 "dup v4.8h, %w3 \n"
4575 "1: \n"
4576 "ldp q2, q3, [%0], #32 \n"
4577 "umull v0.4s, v2.4h, v4.4h \n"
4578 "umull2 v1.4s, v2.8h, v4.8h \n"
4579 "umull v2.4s, v3.4h, v4.4h \n"
4580 "umull2 v3.4s, v3.8h, v4.8h \n"
4581 "prfm pldl1keep, [%0, 448] \n"
4582 "shrn v0.4h, v0.4s, #16 \n"
4583 "shrn2 v0.8h, v1.4s, #16 \n"
4584 "shrn v1.4h, v2.4s, #16 \n"
4585 "shrn2 v1.8h, v3.4s, #16 \n"
4586 "stp q0, q1, [%1], #32 \n" // store 16 pixels
4587 "subs %w2, %w2, #16 \n" // 16 src pixels per loop
4588 "b.gt 1b \n"
4589 : "+r"(src_y), // %0
4590 "+r"(dst_y), // %1
4591 "+r"(width) // %2
4592 : "r"(scale) // %3
4593 : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
4594 }
4595
4596 // Use scale to convert lsb formats to msb, depending on how many bits there are:
4597 // 32768 = 9 bits = shr 1
4598 // 16384 = 10 bits = shr 2
4599 // 4096 = 12 bits = shr 4
4600 // 256 = 16 bits = shr 8
4601 void Convert16To8Row_NEON(const uint16_t* src_y,
4602 uint8_t* dst_y,
4603 int scale,
4604 int width) {
4605 int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
4606 asm volatile(
4607 "dup v2.8h, %w3 \n"
4608 "1: \n"
4609 "ldp q0, q1, [%0], #32 \n"
4610 "ushl v0.8h, v0.8h, v2.8h \n" // shr = v2 is negative
4611 "ushl v1.8h, v1.8h, v2.8h \n"
4612 "prfm pldl1keep, [%0, 448] \n"
4613 "uqxtn v0.8b, v0.8h \n"
4614 "uqxtn2 v0.16b, v1.8h \n"
4615 "subs %w2, %w2, #16 \n" // 16 src pixels per loop
4616 "str q0, [%1], #16 \n" // store 16 pixels
4617 "b.gt 1b \n"
4618 : "+r"(src_y), // %0
4619 "+r"(dst_y), // %1
4620 "+r"(width) // %2
4621 : "r"(shift) // %3
4622 : "cc", "memory", "v0", "v1", "v2");
4623 }
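// Illustrative scalar sketch (hypothetical helper) of Convert16To8Row_NEON:
// for a power-of-two scale, 15 - clz(scale) is the left-shift ushl applies;
// it is negative for the scales listed above, so the data is shifted right
// and then saturated to 8 bits (uqxtn).
static inline uint8_t Convert16To8Sketch(uint16_t v, int scale) {
  int shift = 15 - __builtin_clz((int32_t)scale);  // e.g. 4096 -> -4
  uint32_t out = shift >= 0 ? ((uint32_t)v << shift) : ((uint32_t)v >> -shift);
  return out > 255 ? 255 : (uint8_t)out;
}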
4624
4625 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
4626
4627 #ifdef __cplusplus
4628 } // extern "C"
4629 } // namespace libyuv
4630 #endif
4631