1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
21 // Read 8 Y, 4 U and 4 V from 422
22 #define READYUV422 \
23 "ld1 {v0.8b}, [%0], #8 \n" \
24 "ld1 {v1.s}[0], [%1], #4 \n" \
25 "ld1 {v1.s}[1], [%2], #4 \n"
26
27 // Read 8 Y, 8 U and 8 V from 444
28 #define READYUV444 \
29 "ld1 {v0.8b}, [%0], #8 \n" \
30 "ld1 {v1.d}[0], [%1], #8 \n" \
31 "ld1 {v1.d}[1], [%2], #8 \n" \
32 "uaddlp v1.8h, v1.16b \n" \
33 "rshrn v1.8b, v1.8h, #1 \n"
34
35 // Read 8 Y, and set 4 U and 4 V to 128
36 #define READYUV400 \
37 "ld1 {v0.8b}, [%0], #8 \n" \
38 "movi v1.8b , #128 \n"
39
40 // Read 8 Y and 4 UV from NV12
41 #define READNV12 \
42 "ld1 {v0.8b}, [%0], #8 \n" \
43 "ld1 {v2.8b}, [%1], #8 \n" \
44 "uzp1 v1.8b, v2.8b, v2.8b \n" \
45 "uzp2 v3.8b, v2.8b, v2.8b \n" \
46 "ins v1.s[1], v3.s[0] \n"
47
48 // Read 8 Y and 4 VU from NV21
49 #define READNV21 \
50 "ld1 {v0.8b}, [%0], #8 \n" \
51 "ld1 {v2.8b}, [%1], #8 \n" \
52 "uzp1 v3.8b, v2.8b, v2.8b \n" \
53 "uzp2 v1.8b, v2.8b, v2.8b \n" \
54 "ins v1.s[1], v3.s[0] \n"
55
56 // Read 8 YUY2
57 #define READYUY2 \
58 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
59 "uzp2 v3.8b, v1.8b, v1.8b \n" \
60 "uzp1 v1.8b, v1.8b, v1.8b \n" \
61 "ins v1.s[1], v3.s[0] \n"
62
63 // Read 8 UYVY
64 #define READUYVY \
65 "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
66 "orr v0.8b, v3.8b, v3.8b \n" \
67 "uzp1 v1.8b, v2.8b, v2.8b \n" \
68 "uzp2 v3.8b, v2.8b, v2.8b \n" \
69 "ins v1.s[1], v3.s[0] \n"
70
71 #define YUVTORGB_SETUP \
72 "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
73 "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
74 "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
75 "ld1r {v31.4s}, [%[kYToRgb]] \n" \
76 "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
77 "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
78
79 #define YUVTORGB(vR, vG, vB) \
80 "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
81 "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
82 "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
83 "ushll v0.4s, v0.4h, #0 \n" \
84 "mul v3.4s, v3.4s, v31.4s \n" \
85 "mul v0.4s, v0.4s, v31.4s \n" \
86 "sqshrun v0.4h, v0.4s, #16 \n" \
87 "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
88 "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
89 "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
90 "uxtl v2.8h, v2.8b \n" \
91 "uxtl v1.8h, v1.8b \n" /* Extract U */ \
92 "mul v3.8h, v1.8h, v27.8h \n" \
93 "mul v5.8h, v1.8h, v29.8h \n" \
94 "mul v6.8h, v2.8h, v30.8h \n" \
95 "mul v7.8h, v2.8h, v28.8h \n" \
96 "sqadd v6.8h, v6.8h, v5.8h \n" \
97 "sqadd " #vB \
98 ".8h, v24.8h, v0.8h \n" /* B */ \
99 "sqadd " #vG \
100 ".8h, v25.8h, v0.8h \n" /* G */ \
101 "sqadd " #vR \
102 ".8h, v26.8h, v0.8h \n" /* R */ \
103 "sqadd " #vB ".8h, " #vB \
104 ".8h, v3.8h \n" /* B */ \
105 "sqsub " #vG ".8h, " #vG \
106 ".8h, v6.8h \n" /* G */ \
107 "sqadd " #vR ".8h, " #vR \
108 ".8h, v7.8h \n" /* R */ \
109 "sqshrun " #vB ".8b, " #vB \
110 ".8h, #6 \n" /* B */ \
111 "sqshrun " #vG ".8b, " #vG \
112 ".8h, #6 \n" /* G */ \
113 "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */
114
115 void I444ToARGBRow_NEON(const uint8_t* src_y,
116 const uint8_t* src_u,
117 const uint8_t* src_v,
118 uint8_t* dst_argb,
119 const struct YuvConstants* yuvconstants,
120 int width) {
121 asm volatile (
122 YUVTORGB_SETUP
123 "movi v23.8b, #255 \n" /* A */
124 "1: \n"
125 READYUV444
126 YUVTORGB(v22, v21, v20)
127 "subs %w4, %w4, #8 \n"
128 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
129 "b.gt 1b \n"
130 : "+r"(src_y), // %0
131 "+r"(src_u), // %1
132 "+r"(src_v), // %2
133 "+r"(dst_argb), // %3
134 "+r"(width) // %4
135 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
136 [kUVToG]"r"(&yuvconstants->kUVToG),
137 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
138 [kYToRgb]"r"(&yuvconstants->kYToRgb)
139 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
140 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
141 );
142 }
143
144 void I422ToARGBRow_NEON(const uint8_t* src_y,
145 const uint8_t* src_u,
146 const uint8_t* src_v,
147 uint8_t* dst_argb,
148 const struct YuvConstants* yuvconstants,
149 int width) {
150 asm volatile (
151 YUVTORGB_SETUP
152 "movi v23.8b, #255 \n" /* A */
153 "1: \n"
154 READYUV422
155 YUVTORGB(v22, v21, v20)
156 "subs %w4, %w4, #8 \n"
157 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
158 "b.gt 1b \n"
159 : "+r"(src_y), // %0
160 "+r"(src_u), // %1
161 "+r"(src_v), // %2
162 "+r"(dst_argb), // %3
163 "+r"(width) // %4
164 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
165 [kUVToG]"r"(&yuvconstants->kUVToG),
166 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
167 [kYToRgb]"r"(&yuvconstants->kYToRgb)
168 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
169 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
170 );
171 }
172
173 void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
174 const uint8_t* src_u,
175 const uint8_t* src_v,
176 const uint8_t* src_a,
177 uint8_t* dst_argb,
178 const struct YuvConstants* yuvconstants,
179 int width) {
180 asm volatile (
181 YUVTORGB_SETUP
182 "1: \n"
183 READYUV422
184 YUVTORGB(v22, v21, v20)
185 "ld1 {v23.8b}, [%3], #8 \n"
186 "subs %w5, %w5, #8 \n"
187 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
188 "b.gt 1b \n"
189 : "+r"(src_y), // %0
190 "+r"(src_u), // %1
191 "+r"(src_v), // %2
192 "+r"(src_a), // %3
193 "+r"(dst_argb), // %4
194 "+r"(width) // %5
195 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
196 [kUVToG]"r"(&yuvconstants->kUVToG),
197 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
198 [kYToRgb]"r"(&yuvconstants->kYToRgb)
199 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
200 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
201 );
202 }
203
204 void I422ToRGBARow_NEON(const uint8_t* src_y,
205 const uint8_t* src_u,
206 const uint8_t* src_v,
207 uint8_t* dst_rgba,
208 const struct YuvConstants* yuvconstants,
209 int width) {
210 asm volatile (
211 YUVTORGB_SETUP
212 "movi v20.8b, #255 \n" /* A */
213 "1: \n"
214 READYUV422
215 YUVTORGB(v23, v22, v21)
216 "subs %w4, %w4, #8 \n"
217 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
218 "b.gt 1b \n"
219 : "+r"(src_y), // %0
220 "+r"(src_u), // %1
221 "+r"(src_v), // %2
222 "+r"(dst_rgba), // %3
223 "+r"(width) // %4
224 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
225 [kUVToG]"r"(&yuvconstants->kUVToG),
226 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
227 [kYToRgb]"r"(&yuvconstants->kYToRgb)
228 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
229 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
230 );
231 }
232
233 void I422ToRGB24Row_NEON(const uint8_t* src_y,
234 const uint8_t* src_u,
235 const uint8_t* src_v,
236 uint8_t* dst_rgb24,
237 const struct YuvConstants* yuvconstants,
238 int width) {
239 asm volatile (
240 YUVTORGB_SETUP
241 "1: \n"
242 READYUV422
243 YUVTORGB(v22, v21, v20)
244 "subs %w4, %w4, #8 \n"
245 "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
246 "b.gt 1b \n"
247 : "+r"(src_y), // %0
248 "+r"(src_u), // %1
249 "+r"(src_v), // %2
250 "+r"(dst_rgb24), // %3
251 "+r"(width) // %4
252 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
253 [kUVToG]"r"(&yuvconstants->kUVToG),
254 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
255 [kYToRgb]"r"(&yuvconstants->kYToRgb)
256 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
257 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
258 );
259 }
260
261 #define ARGBTORGB565 \
262 "shll v0.8h, v22.8b, #8 \n" /* R */ \
263 "shll v21.8h, v21.8b, #8 \n" /* G */ \
264 "shll v20.8h, v20.8b, #8 \n" /* B */ \
265 "sri v0.8h, v21.8h, #5 \n" /* RG */ \
266 "sri v0.8h, v20.8h, #11 \n" /* RGB */
267
268 void I422ToRGB565Row_NEON(const uint8_t* src_y,
269 const uint8_t* src_u,
270 const uint8_t* src_v,
271 uint8_t* dst_rgb565,
272 const struct YuvConstants* yuvconstants,
273 int width) {
274 asm volatile(
275 YUVTORGB_SETUP
276 "1: \n" READYUV422 YUVTORGB(
277 v22, v21,
278 v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
279 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
280 // RGB565.
281 "b.gt 1b \n"
282 : "+r"(src_y), // %0
283 "+r"(src_u), // %1
284 "+r"(src_v), // %2
285 "+r"(dst_rgb565), // %3
286 "+r"(width) // %4
287 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
288 [kUVToG] "r"(&yuvconstants->kUVToG),
289 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
290 [kYToRgb] "r"(&yuvconstants->kYToRgb)
291 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
292 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
293 }
294
295 #define ARGBTOARGB1555 \
296 "shll v0.8h, v23.8b, #8 \n" /* A */ \
297 "shll v22.8h, v22.8b, #8 \n" /* R */ \
298 "shll v21.8h, v21.8b, #8 \n" /* G */ \
299 "shll v20.8h, v20.8b, #8 \n" /* B */ \
300 "sri v0.8h, v22.8h, #1 \n" /* AR */ \
301 "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
302 "sri v0.8h, v20.8h, #11 \n" /* ARGB */
303
304 void I422ToARGB1555Row_NEON(const uint8_t* src_y,
305 const uint8_t* src_u,
306 const uint8_t* src_v,
307 uint8_t* dst_argb1555,
308 const struct YuvConstants* yuvconstants,
309 int width) {
310 asm volatile(
311 YUVTORGB_SETUP
312 "movi v23.8b, #255 \n"
313 "1: \n" READYUV422 YUVTORGB(
314 v22, v21,
315 v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
316 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
317 // ARGB1555.
318 "b.gt 1b \n"
319 : "+r"(src_y), // %0
320 "+r"(src_u), // %1
321 "+r"(src_v), // %2
322 "+r"(dst_argb1555), // %3
323 "+r"(width) // %4
324 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
325 [kUVToG] "r"(&yuvconstants->kUVToG),
326 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
327 [kYToRgb] "r"(&yuvconstants->kYToRgb)
328 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
329 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
330 }
331
332 #define ARGBTOARGB4444 \
333 /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
334 "ushr v20.8b, v20.8b, #4 \n" /* B */ \
335 "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
336 "ushr v22.8b, v22.8b, #4 \n" /* R */ \
337 "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
338 "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
339 "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
340 "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
341
342 void I422ToARGB4444Row_NEON(const uint8_t* src_y,
343 const uint8_t* src_u,
344 const uint8_t* src_v,
345 uint8_t* dst_argb4444,
346 const struct YuvConstants* yuvconstants,
347 int width) {
348 asm volatile (
349 YUVTORGB_SETUP
350 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
351 "1: \n"
352 READYUV422
353 YUVTORGB(v22, v21, v20)
354 "subs %w4, %w4, #8 \n"
355 "movi v23.8b, #255 \n"
356 ARGBTOARGB4444
357 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
358 "b.gt 1b \n"
359 : "+r"(src_y), // %0
360 "+r"(src_u), // %1
361 "+r"(src_v), // %2
362 "+r"(dst_argb4444), // %3
363 "+r"(width) // %4
364 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
365 [kUVToG]"r"(&yuvconstants->kUVToG),
366 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
367 [kYToRgb]"r"(&yuvconstants->kYToRgb)
368 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
369 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
370 );
371 }
372
373 void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
374 asm volatile (
375 YUVTORGB_SETUP
376 "movi v23.8b, #255 \n"
377 "1: \n"
378 READYUV400
379 YUVTORGB(v22, v21, v20)
380 "subs %w2, %w2, #8 \n"
381 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
382 "b.gt 1b \n"
383 : "+r"(src_y), // %0
384 "+r"(dst_argb), // %1
385 "+r"(width) // %2
386 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
387 [kUVToG]"r"(&kYuvI601Constants.kUVToG),
388 [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
389 [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
390 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
391 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
392 );
393 }
394
395 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
396 asm volatile(
397 "movi v23.8b, #255 \n"
398 "1: \n"
399 "ld1 {v20.8b}, [%0], #8 \n"
400 "orr v21.8b, v20.8b, v20.8b \n"
401 "orr v22.8b, v20.8b, v20.8b \n"
402 "subs %w2, %w2, #8 \n"
403 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
404 "b.gt 1b \n"
405 : "+r"(src_y), // %0
406 "+r"(dst_argb), // %1
407 "+r"(width) // %2
408 :
409 : "cc", "memory", "v20", "v21", "v22", "v23");
410 }
411
412 void NV12ToARGBRow_NEON(const uint8_t* src_y,
413 const uint8_t* src_uv,
414 uint8_t* dst_argb,
415 const struct YuvConstants* yuvconstants,
416 int width) {
417 asm volatile (
418 YUVTORGB_SETUP
419 "movi v23.8b, #255 \n"
420 "1: \n"
421 READNV12
422 YUVTORGB(v22, v21, v20)
423 "subs %w3, %w3, #8 \n"
424 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
425 "b.gt 1b \n"
426 : "+r"(src_y), // %0
427 "+r"(src_uv), // %1
428 "+r"(dst_argb), // %2
429 "+r"(width) // %3
430 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
431 [kUVToG]"r"(&yuvconstants->kUVToG),
432 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
433 [kYToRgb]"r"(&yuvconstants->kYToRgb)
434 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
435 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
436 );
437 }
438
439 void NV21ToARGBRow_NEON(const uint8_t* src_y,
440 const uint8_t* src_vu,
441 uint8_t* dst_argb,
442 const struct YuvConstants* yuvconstants,
443 int width) {
444 asm volatile (
445 YUVTORGB_SETUP
446 "movi v23.8b, #255 \n"
447 "1: \n"
448 READNV21
449 YUVTORGB(v22, v21, v20)
450 "subs %w3, %w3, #8 \n"
451 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
452 "b.gt 1b \n"
453 : "+r"(src_y), // %0
454 "+r"(src_vu), // %1
455 "+r"(dst_argb), // %2
456 "+r"(width) // %3
457 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
458 [kUVToG]"r"(&yuvconstants->kUVToG),
459 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
460 [kYToRgb]"r"(&yuvconstants->kYToRgb)
461 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
462 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
463 );
464 }
465
466 void NV12ToRGB24Row_NEON(const uint8_t* src_y,
467 const uint8_t* src_uv,
468 uint8_t* dst_rgb24,
469 const struct YuvConstants* yuvconstants,
470 int width) {
471 asm volatile (
472 YUVTORGB_SETUP
473 "1: \n"
474 READNV12
475 YUVTORGB(v22, v21, v20)
476 "subs %w3, %w3, #8 \n"
477 "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
478 "b.gt 1b \n"
479 : "+r"(src_y), // %0
480 "+r"(src_uv), // %1
481 "+r"(dst_rgb24), // %2
482 "+r"(width) // %3
483 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
484 [kUVToG]"r"(&yuvconstants->kUVToG),
485 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
486 [kYToRgb]"r"(&yuvconstants->kYToRgb)
487 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
488 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
489 );
490 }
491
492 void NV21ToRGB24Row_NEON(const uint8_t* src_y,
493 const uint8_t* src_vu,
494 uint8_t* dst_rgb24,
495 const struct YuvConstants* yuvconstants,
496 int width) {
497 asm volatile (
498 YUVTORGB_SETUP
499 "1: \n"
500 READNV21
501 YUVTORGB(v22, v21, v20)
502 "subs %w3, %w3, #8 \n"
503 "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
504 "b.gt 1b \n"
505 : "+r"(src_y), // %0
506 "+r"(src_vu), // %1
507 "+r"(dst_rgb24), // %2
508 "+r"(width) // %3
509 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
510 [kUVToG]"r"(&yuvconstants->kUVToG),
511 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
512 [kYToRgb]"r"(&yuvconstants->kYToRgb)
513 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
514 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
515 );
516 }
517
518 void NV12ToRGB565Row_NEON(const uint8_t* src_y,
519 const uint8_t* src_uv,
520 uint8_t* dst_rgb565,
521 const struct YuvConstants* yuvconstants,
522 int width) {
523 asm volatile(
524 YUVTORGB_SETUP
525 "1: \n" READNV12 YUVTORGB(
526 v22, v21,
527 v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
528 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
529 // RGB565.
530 "b.gt 1b \n"
531 : "+r"(src_y), // %0
532 "+r"(src_uv), // %1
533 "+r"(dst_rgb565), // %2
534 "+r"(width) // %3
535 : [kUVToRB] "r"(&yuvconstants->kUVToRB),
536 [kUVToG] "r"(&yuvconstants->kUVToG),
537 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
538 [kYToRgb] "r"(&yuvconstants->kYToRgb)
539 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
540 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
541 }
542
543 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
544 uint8_t* dst_argb,
545 const struct YuvConstants* yuvconstants,
546 int width) {
547 asm volatile (
548 YUVTORGB_SETUP
549 "movi v23.8b, #255 \n"
550 "1: \n"
551 READYUY2
552 YUVTORGB(v22, v21, v20)
553 "subs %w2, %w2, #8 \n"
554 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
555 "b.gt 1b \n"
556 : "+r"(src_yuy2), // %0
557 "+r"(dst_argb), // %1
558 "+r"(width) // %2
559 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
560 [kUVToG]"r"(&yuvconstants->kUVToG),
561 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
562 [kYToRgb]"r"(&yuvconstants->kYToRgb)
563 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
564 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
565 );
566 }
567
568 void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
569 uint8_t* dst_argb,
570 const struct YuvConstants* yuvconstants,
571 int width) {
572 asm volatile (
573 YUVTORGB_SETUP
574 "movi v23.8b, #255 \n"
575 "1: \n"
576 READUYVY
577 YUVTORGB(v22, v21, v20)
578 "subs %w2, %w2, #8 \n"
579 "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
580 "b.gt 1b \n"
581 : "+r"(src_uyvy), // %0
582 "+r"(dst_argb), // %1
583 "+r"(width) // %2
584 : [kUVToRB]"r"(&yuvconstants->kUVToRB),
585 [kUVToG]"r"(&yuvconstants->kUVToG),
586 [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
587 [kYToRgb]"r"(&yuvconstants->kYToRgb)
588 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
589 "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
590 );
591 }
592
593 // Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
594 void SplitUVRow_NEON(const uint8_t* src_uv,
595 uint8_t* dst_u,
596 uint8_t* dst_v,
597 int width) {
598 asm volatile(
599 "1: \n"
600 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
601 "subs %w3, %w3, #16 \n" // 16 processed per loop
602 "st1 {v0.16b}, [%1], #16 \n" // store U
603 "st1 {v1.16b}, [%2], #16 \n" // store V
604 "b.gt 1b \n"
605 : "+r"(src_uv), // %0
606 "+r"(dst_u), // %1
607 "+r"(dst_v), // %2
608 "+r"(width) // %3 // Output registers
609 : // Input registers
610 : "cc", "memory", "v0", "v1" // Clobber List
611 );
612 }
613
614 // Reads 16 U's and V's and writes out 16 pairs of UV.
615 void MergeUVRow_NEON(const uint8_t* src_u,
616 const uint8_t* src_v,
617 uint8_t* dst_uv,
618 int width) {
619 asm volatile(
620 "1: \n"
621 "ld1 {v0.16b}, [%0], #16 \n" // load U
622 "ld1 {v1.16b}, [%1], #16 \n" // load V
623 "subs %w3, %w3, #16 \n" // 16 processed per loop
624 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
625 "b.gt 1b \n"
626 : "+r"(src_u), // %0
627 "+r"(src_v), // %1
628 "+r"(dst_uv), // %2
629 "+r"(width) // %3 // Output registers
630 : // Input registers
631 : "cc", "memory", "v0", "v1" // Clobber List
632 );
633 }
634
635 // Reads 16 packed RGB pixels and writes to planar dst_r, dst_g, dst_b.
636 void SplitRGBRow_NEON(const uint8_t* src_rgb,
637 uint8_t* dst_r,
638 uint8_t* dst_g,
639 uint8_t* dst_b,
640 int width) {
641 asm volatile(
642 "1: \n"
643 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB
644 "subs %w4, %w4, #16 \n" // 16 processed per loop
645 "st1 {v0.16b}, [%1], #16 \n" // store R
646 "st1 {v1.16b}, [%2], #16 \n" // store G
647 "st1 {v2.16b}, [%3], #16 \n" // store B
648 "b.gt 1b \n"
649 : "+r"(src_rgb), // %0
650 "+r"(dst_r), // %1
651 "+r"(dst_g), // %2
652 "+r"(dst_b), // %3
653 "+r"(width) // %4
654 : // Input registers
655 : "cc", "memory", "v0", "v1", "v2" // Clobber List
656 );
657 }
658
659 // Reads 16 planar R's, G's and B's and writes out 16 packed RGB pixels at a time.
660 void MergeRGBRow_NEON(const uint8_t* src_r,
661 const uint8_t* src_g,
662 const uint8_t* src_b,
663 uint8_t* dst_rgb,
664 int width) {
665 asm volatile(
666 "1: \n"
667 "ld1 {v0.16b}, [%0], #16 \n" // load R
668 "ld1 {v1.16b}, [%1], #16 \n" // load G
669 "ld1 {v2.16b}, [%2], #16 \n" // load B
670 "subs %w4, %w4, #16 \n" // 16 processed per loop
671 "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
672 "b.gt 1b \n"
673 : "+r"(src_r), // %0
674 "+r"(src_g), // %1
675 "+r"(src_b), // %2
676 "+r"(dst_rgb), // %3
677 "+r"(width) // %4
678 : // Input registers
679 : "cc", "memory", "v0", "v1", "v2" // Clobber List
680 );
681 }
682
683 // Copy multiple of 32.
684 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
685 asm volatile(
686 "1: \n"
687 "ldp q0, q1, [%0], #32 \n"
688 "subs %w2, %w2, #32 \n" // 32 processed per loop
689 "stp q0, q1, [%1], #32 \n"
690 "b.gt 1b \n"
691 : "+r"(src), // %0
692 "+r"(dst), // %1
693 "+r"(width) // %2 // Output registers
694 : // Input registers
695 : "cc", "memory", "v0", "v1" // Clobber List
696 );
697 }
698
699 // SetRow writes 'width' bytes using an 8 bit value repeated.
700 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
701 asm volatile(
702 "dup v0.16b, %w2 \n" // duplicate 16 bytes
703 "1: \n"
704 "subs %w1, %w1, #16 \n" // 16 bytes per loop
705 "st1 {v0.16b}, [%0], #16 \n" // store
706 "b.gt 1b \n"
707 : "+r"(dst), // %0
708 "+r"(width) // %1
709 : "r"(v8) // %2
710 : "cc", "memory", "v0");
711 }
712
713 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
714 asm volatile(
715 "dup v0.4s, %w2 \n" // duplicate 4 ints
716 "1: \n"
717 "subs %w1, %w1, #4 \n" // 4 ints per loop
718 "st1 {v0.16b}, [%0], #16 \n" // store
719 "b.gt 1b \n"
720 : "+r"(dst), // %0
721 "+r"(width) // %1
722 : "r"(v32) // %2
723 : "cc", "memory", "v0");
724 }
725
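// MirrorRow reverses a row of bytes: it walks the source backwards 16 bytes at
// a time (negative post-index), rev64 reverses each 8-byte half, and the two
// halves are stored in swapped order to complete the 16-byte reversal.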
726 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
727 asm volatile(
728 // Start at end of source row.
729 "add %0, %0, %w2, sxtw \n"
730 "sub %0, %0, #16 \n"
731 "1: \n"
732 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
733 "subs %w2, %w2, #16 \n" // 16 pixels per loop.
734 "rev64 v0.16b, v0.16b \n"
735 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
736 "st1 {v0.D}[0], [%1], #8 \n"
737 "b.gt 1b \n"
738 : "+r"(src), // %0
739 "+r"(dst), // %1
740 "+r"(width) // %2
741 : "r"((ptrdiff_t)-16) // %3
742 : "cc", "memory", "v0");
743 }
744
745 void MirrorUVRow_NEON(const uint8_t* src_uv,
746 uint8_t* dst_u,
747 uint8_t* dst_v,
748 int width) {
749 asm volatile(
750 // Start at end of source row.
751 "add %0, %0, %w3, sxtw #1 \n"
752 "sub %0, %0, #16 \n"
753 "1: \n"
754 "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
755 "subs %w3, %w3, #8 \n" // 8 pixels per loop.
756 "rev64 v0.8b, v0.8b \n"
757 "rev64 v1.8b, v1.8b \n"
758 "st1 {v0.8b}, [%1], #8 \n" // dst += 8
759 "st1 {v1.8b}, [%2], #8 \n"
760 "b.gt 1b \n"
761 : "+r"(src_uv), // %0
762 "+r"(dst_u), // %1
763 "+r"(dst_v), // %2
764 "+r"(width) // %3
765 : "r"((ptrdiff_t)-16) // %4
766 : "cc", "memory", "v0", "v1");
767 }
768
769 void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
770 asm volatile(
771 // Start at end of source row.
772 "add %0, %0, %w2, sxtw #2 \n"
773 "sub %0, %0, #16 \n"
774 "1: \n"
775 "ld1 {v0.16b}, [%0], %3 \n" // src -= 16
776 "subs %w2, %w2, #4 \n" // 4 pixels per loop.
777 "rev64 v0.4s, v0.4s \n"
778 "st1 {v0.D}[1], [%1], #8 \n" // dst += 16
779 "st1 {v0.D}[0], [%1], #8 \n"
780 "b.gt 1b \n"
781 : "+r"(src), // %0
782 "+r"(dst), // %1
783 "+r"(width) // %2
784 : "r"((ptrdiff_t)-16) // %3
785 : "cc", "memory", "v0");
786 }
787
788 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
789 uint8_t* dst_argb,
790 int width) {
791 asm volatile(
792 "movi v4.8b, #255 \n" // Alpha
793 "1: \n"
794 "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
795 "subs %w2, %w2, #8 \n" // 8 processed per loop.
796 "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
797 "b.gt 1b \n"
798 : "+r"(src_rgb24), // %0
799 "+r"(dst_argb), // %1
800 "+r"(width) // %2
801 :
802 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
803 );
804 }
805
806 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
807 asm volatile(
808 "movi v5.8b, #255 \n" // Alpha
809 "1: \n"
810 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
811 "subs %w2, %w2, #8 \n" // 8 processed per loop.
812 "orr v3.8b, v1.8b, v1.8b \n" // move g
813 "orr v4.8b, v0.8b, v0.8b \n" // move r
814 "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
815 "b.gt 1b \n"
816 : "+r"(src_raw), // %0
817 "+r"(dst_argb), // %1
818 "+r"(width) // %2
819 :
820 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
821 );
822 }
823
824 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
825 asm volatile(
826 "1: \n"
827 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
828 "subs %w2, %w2, #8 \n" // 8 processed per loop.
829 "orr v3.8b, v1.8b, v1.8b \n" // move g
830 "orr v4.8b, v0.8b, v0.8b \n" // move r
831 "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
832 "b.gt 1b \n"
833 : "+r"(src_raw), // %0
834 "+r"(dst_rgb24), // %1
835 "+r"(width) // %2
836 :
837 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
838 );
839 }
840
841 #define RGB565TOARGB \
842 "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
843 "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
844 "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
845 "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
846 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
847 "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
848 "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
849 "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
850 "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
851 "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
852 "dup v2.2D, v0.D[1] \n" /* R */
853
854 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
855 uint8_t* dst_argb,
856 int width) {
857 asm volatile(
858 "movi v3.8b, #255 \n" // Alpha
859 "1: \n"
860 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
861 "subs %w2, %w2, #8 \n" // 8 processed per loop.
862 RGB565TOARGB
863 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
864 "b.gt 1b \n"
865 : "+r"(src_rgb565), // %0
866 "+r"(dst_argb), // %1
867 "+r"(width) // %2
868 :
869 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
870 );
871 }
872
873 #define ARGB1555TOARGB \
874 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
875 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
876 "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
877 \
878 "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
879 "xtn2 v3.16b, v2.8h \n" \
880 \
881 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
882 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
883 \
884 "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
885 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
886 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
887 \
888 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
889 "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
890 "dup v1.2D, v0.D[1] \n" \
891 "dup v3.2D, v2.D[1] \n"
892
893 // RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
894 #define RGB555TOARGB \
895 "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
896 "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
897 "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
898 \
899 "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
900 "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
901 \
902 "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
903 "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
904 "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
905 \
906 "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
907 "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
908 "dup v1.2D, v0.D[1] \n" /* G */
909
910 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
911 uint8_t* dst_argb,
912 int width) {
913 asm volatile(
914 "movi v3.8b, #255 \n" // Alpha
915 "1: \n"
916 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
917 "subs %w2, %w2, #8 \n" // 8 processed per loop.
918 ARGB1555TOARGB
919 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
920 // pixels
921 "b.gt 1b \n"
922 : "+r"(src_argb1555), // %0
923 "+r"(dst_argb), // %1
924 "+r"(width) // %2
925 :
926 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
927 );
928 }
929
930 #define ARGB4444TOARGB \
931 "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
932 "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
933 "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
934 "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
935 "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
936 "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
937 "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
938 "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
939 "dup v0.2D, v2.D[1] \n" \
940 "dup v1.2D, v3.D[1] \n"
941
942 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
943 uint8_t* dst_argb,
944 int width) {
945 asm volatile(
946 "1: \n"
947 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
948 "subs %w2, %w2, #8 \n" // 8 processed per loop.
949 ARGB4444TOARGB
950 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
951 // pixels
952 "b.gt 1b \n"
953 : "+r"(src_argb4444), // %0
954 "+r"(dst_argb), // %1
955 "+r"(width) // %2
956 :
957 : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
958 );
959 }
960
961 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
962 uint8_t* dst_rgb24,
963 int width) {
964 asm volatile(
965 "1: \n"
966 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
967 "subs %w2, %w2, #8 \n" // 8 processed per loop.
968 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
969 // RGB24.
970 "b.gt 1b \n"
971 : "+r"(src_argb), // %0
972 "+r"(dst_rgb24), // %1
973 "+r"(width) // %2
974 :
975 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
976 );
977 }
978
979 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
980 asm volatile(
981 "1: \n"
982 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
983 "subs %w2, %w2, #8 \n" // 8 processed per loop.
984 "orr v4.8b, v2.8b, v2.8b \n" // mov g
985 "orr v5.8b, v1.8b, v1.8b \n" // mov b
986 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
987 "b.gt 1b \n"
988 : "+r"(src_argb), // %0
989 "+r"(dst_raw), // %1
990 "+r"(width) // %2
991 :
992 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
993 );
994 }
995
996 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
997 asm volatile(
998 "1: \n"
999 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
1000 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1001 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1002 "b.gt 1b \n"
1003 : "+r"(src_yuy2), // %0
1004 "+r"(dst_y), // %1
1005 "+r"(width) // %2
1006 :
1007 : "cc", "memory", "v0", "v1" // Clobber List
1008 );
1009 }
1010
1011 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
1012 asm volatile(
1013 "1: \n"
1014 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1015 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1016 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1017 "b.gt 1b \n"
1018 : "+r"(src_uyvy), // %0
1019 "+r"(dst_y), // %1
1020 "+r"(width) // %2
1021 :
1022 : "cc", "memory", "v0", "v1" // Clobber List
1023 );
1024 }
1025
1026 void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
1027 uint8_t* dst_u,
1028 uint8_t* dst_v,
1029 int width) {
1030 asm volatile(
1031 "1: \n"
1032 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2
1033 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1034 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1035 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1036 "b.gt 1b \n"
1037 : "+r"(src_yuy2), // %0
1038 "+r"(dst_u), // %1
1039 "+r"(dst_v), // %2
1040 "+r"(width) // %3
1041 :
1042 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1043 );
1044 }
1045
1046 void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
1047 uint8_t* dst_u,
1048 uint8_t* dst_v,
1049 int width) {
1050 asm volatile(
1051 "1: \n"
1052 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY
1053 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1054 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1055 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1056 "b.gt 1b \n"
1057 : "+r"(src_uyvy), // %0
1058 "+r"(dst_u), // %1
1059 "+r"(dst_v), // %2
1060 "+r"(width) // %3
1061 :
1062 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1063 );
1064 }
1065
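// Reads 16 YUY2 pixels from two rows and averages the U and V samples
// vertically with urhadd, producing 8 U and 8 V values (2x2 subsampling).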
1066 void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
1067 int stride_yuy2,
1068 uint8_t* dst_u,
1069 uint8_t* dst_v,
1070 int width) {
1071 const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
1072 asm volatile(
1073 "1: \n"
1074 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1075 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1076 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1077 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1078 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1079 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1080 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1081 "b.gt 1b \n"
1082 : "+r"(src_yuy2), // %0
1083 "+r"(src_yuy2b), // %1
1084 "+r"(dst_u), // %2
1085 "+r"(dst_v), // %3
1086 "+r"(width) // %4
1087 :
1088 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1089 "v7" // Clobber List
1090 );
1091 }
1092
1093 void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
1094 int stride_uyvy,
1095 uint8_t* dst_u,
1096 uint8_t* dst_v,
1097 int width) {
1098 const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
1099 asm volatile(
1100 "1: \n"
1101 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1102 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1103 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1104 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1105 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1106 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1107 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1108 "b.gt 1b \n"
1109 : "+r"(src_uyvy), // %0
1110 "+r"(src_uyvyb), // %1
1111 "+r"(dst_u), // %2
1112 "+r"(dst_v), // %3
1113 "+r"(width) // %4
1114 :
1115 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1116 "v7" // Clobber List
1117 );
1118 }
1119
1120 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
1121 void ARGBShuffleRow_NEON(const uint8_t* src_argb,
1122 uint8_t* dst_argb,
1123 const uint8_t* shuffler,
1124 int width) {
1125 asm volatile(
1126 "ld1 {v2.16b}, [%3] \n" // shuffler
1127 "1: \n"
1128 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1129 "subs %w2, %w2, #4 \n" // 4 processed per loop
1130 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1131 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1132 "b.gt 1b \n"
1133 : "+r"(src_argb), // %0
1134 "+r"(dst_argb), // %1
1135 "+r"(width) // %2
1136 : "r"(shuffler) // %3
1137 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1138 );
1139 }
1140
1141 void I422ToYUY2Row_NEON(const uint8_t* src_y,
1142 const uint8_t* src_u,
1143 const uint8_t* src_v,
1144 uint8_t* dst_yuy2,
1145 int width) {
1146 asm volatile(
1147 "1: \n"
1148 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1149 "orr v2.8b, v1.8b, v1.8b \n"
1150 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1151 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1152 "subs %w4, %w4, #16 \n" // 16 pixels
1153 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1154 "b.gt 1b \n"
1155 : "+r"(src_y), // %0
1156 "+r"(src_u), // %1
1157 "+r"(src_v), // %2
1158 "+r"(dst_yuy2), // %3
1159 "+r"(width) // %4
1160 :
1161 : "cc", "memory", "v0", "v1", "v2", "v3");
1162 }
1163
1164 void I422ToUYVYRow_NEON(const uint8_t* src_y,
1165 const uint8_t* src_u,
1166 const uint8_t* src_v,
1167 uint8_t* dst_uyvy,
1168 int width) {
1169 asm volatile(
1170 "1: \n"
1171 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1172 "orr v3.8b, v2.8b, v2.8b \n"
1173 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1174 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1175 "subs %w4, %w4, #16 \n" // 16 pixels
1176 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1177 "b.gt 1b \n"
1178 : "+r"(src_y), // %0
1179 "+r"(src_u), // %1
1180 "+r"(src_v), // %2
1181 "+r"(dst_uyvy), // %3
1182 "+r"(width) // %4
1183 :
1184 : "cc", "memory", "v0", "v1", "v2", "v3");
1185 }
1186
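// Usage sketch (an illustration, not part of this file's API surface): the
// higher-level converters normally call a row function like the one below once
// per scanline, e.g.
//   for (int y = 0; y < height; ++y) {
//     ARGBToRGB565Row_NEON(src_argb + y * src_stride_argb,
//                          dst_rgb565 + y * dst_stride_rgb565, width);
//   }
// This assumes width is a multiple of 8 (8 pixels stored per iteration);
// leftover pixels are normally handled by the _Any_ variants in row_any.cc.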
1187 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
1188 uint8_t* dst_rgb565,
1189 int width) {
1190 asm volatile(
1191 "1: \n"
1192 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1193 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1194 ARGBTORGB565
1195 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1196 "b.gt 1b \n"
1197 : "+r"(src_argb), // %0
1198 "+r"(dst_rgb565), // %1
1199 "+r"(width) // %2
1200 :
1201 : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1202 }
1203
1204 void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
1205 uint8_t* dst_rgb,
1206 const uint32_t dither4,
1207 int width) {
1208 asm volatile(
1209 "dup v1.4s, %w2 \n" // dither4
1210 "1: \n"
1211 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels
1212 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1213 "uqadd v20.8b, v20.8b, v1.8b \n"
1214 "uqadd v21.8b, v21.8b, v1.8b \n"
1215 "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
1216 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
1217 "b.gt 1b \n"
1218 : "+r"(dst_rgb) // %0
1219 : "r"(src_argb), // %1
1220 "r"(dither4), // %2
1221 "r"(width) // %3
1222 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
1223 }
1224
1225 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
1226 uint8_t* dst_argb1555,
1227 int width) {
1228 asm volatile(
1229 "1: \n"
1230 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1231 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1232 ARGBTOARGB1555
1233 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
1234 // ARGB1555.
1235 "b.gt 1b \n"
1236 : "+r"(src_argb), // %0
1237 "+r"(dst_argb1555), // %1
1238 "+r"(width) // %2
1239 :
1240 : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
1241 }
1242
1243 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
1244 uint8_t* dst_argb4444,
1245 int width) {
1246 asm volatile(
1247 "movi v4.16b, #0x0f \n" // bits to clear with
1248 // vbic.
1249 "1: \n"
1250 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1251 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1252 ARGBTOARGB4444
1253 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
1254 // ARGB4444.
1255 "b.gt 1b \n"
1256 : "+r"(src_argb), // %0
1257 "+r"(dst_argb4444), // %1
1258 "+r"(width) // %2
1259 :
1260 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
1261 }
1262
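// Studio-range (BT.601) luma: Y = round((13 * B + 65 * G + 33 * R) / 128) + 16,
// with the divide done as a rounding saturating narrow (sqrshrun #7).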
1263 void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1264 asm volatile(
1265 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1266 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1267 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1268 "movi v7.8b, #16 \n" // Add 16 constant
1269 "1: \n"
1270 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
1271 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1272 "umull v3.8h, v0.8b, v4.8b \n" // B
1273 "umlal v3.8h, v1.8b, v5.8b \n" // G
1274 "umlal v3.8h, v2.8b, v6.8b \n" // R
1275 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1276 "uqadd v0.8b, v0.8b, v7.8b \n"
1277 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1278 "b.gt 1b \n"
1279 : "+r"(src_argb), // %0
1280 "+r"(dst_y), // %1
1281 "+r"(width) // %2
1282 :
1283 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1284 }
1285
1286 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
1287 uint8_t* dst_a,
1288 int width) {
1289 asm volatile(
1290 "1: \n"
1291 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16
1292 // pixels
1293 "subs %w2, %w2, #16 \n" // 16 processed per loop
1294 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
1295 "b.gt 1b \n"
1296 : "+r"(src_argb), // %0
1297 "+r"(dst_a), // %1
1298 "+r"(width) // %2
1299 :
1300 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1301 );
1302 }
1303
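// Full-range (JPEG) luma: Y = (15 * B + 75 * G + 38 * R + 64) >> 7 with no
// +16 offset; the coefficients sum to 128 so white maps to 255.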
1304 void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
1305 asm volatile(
1306 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1307 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1308 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1309 "1: \n"
1310 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
1311 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1312 "umull v3.8h, v0.8b, v4.8b \n" // B
1313 "umlal v3.8h, v1.8b, v5.8b \n" // G
1314 "umlal v3.8h, v2.8b, v6.8b \n" // R
1315 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1316 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1317 "b.gt 1b \n"
1318 : "+r"(src_argb), // %0
1319 "+r"(dst_y), // %1
1320 "+r"(width) // %2
1321 :
1322 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
1323 }
1324
1325 // 8x1 pixels.
1326 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
1327 uint8_t* dst_u,
1328 uint8_t* dst_v,
1329 int width) {
1330 asm volatile(
1331 "movi v24.8b, #112 \n" // UB / VR 0.875
1332 // coefficient
1333 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1334 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1335 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1336 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1337 "movi v29.16b,#0x80 \n" // 128.5
1338 "1: \n"
1339 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
1340 // pixels.
1341 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1342 "umull v4.8h, v0.8b, v24.8b \n" // B
1343 "umlsl v4.8h, v1.8b, v25.8b \n" // G
1344 "umlsl v4.8h, v2.8b, v26.8b \n" // R
1345 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
1346
1347 "umull v3.8h, v2.8b, v24.8b \n" // R
1348 "umlsl v3.8h, v1.8b, v28.8b \n" // G
1349 "umlsl v3.8h, v0.8b, v27.8b \n" // B
1350 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
1351
1352 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
1353 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1354
1355 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1356 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1357 "b.gt 1b \n"
1358 : "+r"(src_argb), // %0
1359 "+r"(dst_u), // %1
1360 "+r"(dst_v), // %2
1361 "+r"(width) // %3
1362 :
1363 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
1364 "v27", "v28", "v29");
1365 }
1366
1367 #define RGBTOUV_SETUP_REG \
1368 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
1369 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
1370 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
1371 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
1372 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
1373 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
1374
1375 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1376 // clang-format off
1377 #define RGBTOUV(QB, QG, QR) \
1378 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
1379 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
1380 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
1381 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
1382 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
1383 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
1384 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1385 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1386 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1387 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1388 // clang-format on
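// RGBTOUV consumes the 2x2-subsampled sums produced by the callers: after
// uaddlp/uadalp and urshr #1 each input halfword holds half the sum of a 2x2
// block, i.e. twice its average, which is why the coefficients above are half
// of those used by ARGBToUV444Row_NEON.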
1389
1390 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1391 // TODO(fbarchard): consider ptrdiff_t for all strides.
1392
1393 void ARGBToUVRow_NEON(const uint8_t* src_argb,
1394 int src_stride_argb,
1395 uint8_t* dst_u,
1396 uint8_t* dst_v,
1397 int width) {
1398 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
1399 asm volatile (
1400 RGBTOUV_SETUP_REG
1401 "1: \n"
1402 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1403 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1404 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1405 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1406
1407 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1408 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1409 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1410 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1411
1412 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1413 "urshr v1.8h, v1.8h, #1 \n"
1414 "urshr v2.8h, v2.8h, #1 \n"
1415
1416 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1417 RGBTOUV(v0.8h, v1.8h, v2.8h)
1418 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1419 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1420 "b.gt 1b \n"
1421 : "+r"(src_argb), // %0
1422 "+r"(src_argb_1), // %1
1423 "+r"(dst_u), // %2
1424 "+r"(dst_v), // %3
1425 "+r"(width) // %4
1426 :
1427 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1428 "v20", "v21", "v22", "v23", "v24", "v25"
1429 );
1430 }
1431
1432 // TODO(fbarchard): Subsample match C code.
1433 void ARGBToUVJRow_NEON(const uint8_t* src_argb,
1434 int src_stride_argb,
1435 uint8_t* dst_u,
1436 uint8_t* dst_v,
1437 int width) {
1438 const uint8_t* src_argb_1 = src_argb + src_stride_argb;
1439 asm volatile (
1440 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1441 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1442 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1443 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1444 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1445 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1446 "1: \n"
1447 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1448 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1449 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1450 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1451 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1452 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1453 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1454 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1455
1456 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1457 "urshr v1.8h, v1.8h, #1 \n"
1458 "urshr v2.8h, v2.8h, #1 \n"
1459
1460 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1461 RGBTOUV(v0.8h, v1.8h, v2.8h)
1462 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1463 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1464 "b.gt 1b \n"
1465 : "+r"(src_argb), // %0
1466 "+r"(src_argb_1), // %1
1467 "+r"(dst_u), // %2
1468 "+r"(dst_v), // %3
1469 "+r"(width) // %4
1470 :
1471 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1472 "v20", "v21", "v22", "v23", "v24", "v25"
1473 );
1474 }
1475
1476 void BGRAToUVRow_NEON(const uint8_t* src_bgra,
1477 int src_stride_bgra,
1478 uint8_t* dst_u,
1479 uint8_t* dst_v,
1480 int width) {
1481 const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
1482 asm volatile (
1483 RGBTOUV_SETUP_REG
1484 "1: \n"
1485 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1486 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1487 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1488 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
1489 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
1490 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
1491 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1492 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
1493
1494 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1495 "urshr v1.8h, v3.8h, #1 \n"
1496 "urshr v2.8h, v2.8h, #1 \n"
1497
1498 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1499 RGBTOUV(v0.8h, v1.8h, v2.8h)
1500 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1501 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1502 "b.gt 1b \n"
1503 : "+r"(src_bgra), // %0
1504 "+r"(src_bgra_1), // %1
1505 "+r"(dst_u), // %2
1506 "+r"(dst_v), // %3
1507 "+r"(width) // %4
1508 :
1509 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1510 "v20", "v21", "v22", "v23", "v24", "v25"
1511 );
1512 }
1513
1514 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
1515 int src_stride_abgr,
1516 uint8_t* dst_u,
1517 uint8_t* dst_v,
1518 int width) {
1519 const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
1520 asm volatile (
1521 RGBTOUV_SETUP_REG
1522 "1: \n"
1523 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1524 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1525 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1526 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1527 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1528 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1529 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1530 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1531
1532 "urshr v0.8h, v3.8h, #1 \n" // 2x average
1533 "urshr v2.8h, v2.8h, #1 \n"
1534 "urshr v1.8h, v1.8h, #1 \n"
1535
1536 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1537 RGBTOUV(v0.8h, v2.8h, v1.8h)
1538 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1539 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1540 "b.gt 1b \n"
1541 : "+r"(src_abgr), // %0
1542 "+r"(src_abgr_1), // %1
1543 "+r"(dst_u), // %2
1544 "+r"(dst_v), // %3
1545 "+r"(width) // %4
1546 :
1547 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1548 "v20", "v21", "v22", "v23", "v24", "v25"
1549 );
1550 }
1551
1552 void RGBAToUVRow_NEON(const uint8_t* src_rgba,
1553 int src_stride_rgba,
1554 uint8_t* dst_u,
1555 uint8_t* dst_v,
1556 int width) {
1557 const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
1558 asm volatile (
1559 RGBTOUV_SETUP_REG
1560 "1: \n"
1561 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1562 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
1563 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1564 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
1565 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1566 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
1567 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1568 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
1569
1570 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1571 "urshr v1.8h, v1.8h, #1 \n"
1572 "urshr v2.8h, v2.8h, #1 \n"
1573
1574 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1575 RGBTOUV(v0.8h, v1.8h, v2.8h)
1576 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1577 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1578 "b.gt 1b \n"
1579 : "+r"(src_rgba), // %0
1580 "+r"(src_rgba_1), // %1
1581 "+r"(dst_u), // %2
1582 "+r"(dst_v), // %3
1583 "+r"(width) // %4
1584 :
1585 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1586 "v20", "v21", "v22", "v23", "v24", "v25"
1587 );
1588 }
1589
1590 void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
1591 int src_stride_rgb24,
1592 uint8_t* dst_u,
1593 uint8_t* dst_v,
1594 int width) {
1595 const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1596 asm volatile (
1597 RGBTOUV_SETUP_REG
1598 "1: \n"
1599 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
1600 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1601 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1602 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1603 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
1604 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1605 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1606 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1607
1608 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1609 "urshr v1.8h, v1.8h, #1 \n"
1610 "urshr v2.8h, v2.8h, #1 \n"
1611
1612 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1613 RGBTOUV(v0.8h, v1.8h, v2.8h)
1614 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1615 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1616 "b.gt 1b \n"
1617 : "+r"(src_rgb24), // %0
1618 "+r"(src_rgb24_1), // %1
1619 "+r"(dst_u), // %2
1620 "+r"(dst_v), // %3
1621 "+r"(width) // %4
1622 :
1623 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1624 "v20", "v21", "v22", "v23", "v24", "v25"
1625 );
1626 }
1627
1628 void RAWToUVRow_NEON(const uint8_t* src_raw,
1629 int src_stride_raw,
1630 uint8_t* dst_u,
1631 uint8_t* dst_v,
1632 int width) {
1633 const uint8_t* src_raw_1 = src_raw + src_stride_raw;
1634 asm volatile (
1635 RGBTOUV_SETUP_REG
1636 "1: \n"
1637 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
1638 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1639 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1640 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1641 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels
1642 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1643 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1644 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1645
1646 "urshr v2.8h, v2.8h, #1 \n" // 2x average
1647 "urshr v1.8h, v1.8h, #1 \n"
1648 "urshr v0.8h, v0.8h, #1 \n"
1649
1650 "subs %w4, %w4, #16 \n" // 32 processed per loop.
1651 RGBTOUV(v2.8h, v1.8h, v0.8h)
1652 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1653 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1654 "b.gt 1b \n"
1655 : "+r"(src_raw), // %0
1656 "+r"(src_raw_1), // %1
1657 "+r"(dst_u), // %2
1658 "+r"(dst_v), // %3
1659 "+r"(width) // %4
1660 :
1661 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1662 "v20", "v21", "v22", "v23", "v24", "v25"
1663 );
1664 }
1665
1666 // 16x2 pixels -> 8x1. width is number of rgb565 pixels. e.g. 16.
1667 void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
1668 int src_stride_rgb565,
1669 uint8_t* dst_u,
1670 uint8_t* dst_v,
1671 int width) {
1672 const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1673 asm volatile(
1674 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) /
1675 // 2
1676 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
1677 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
1678 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
1679 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
1680 "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit
1681 "1: \n"
1682 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1683 RGB565TOARGB
1684 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1685 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1686 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1687 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
1688 RGB565TOARGB
1689 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1690 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1691 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1692
1693 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
1694 RGB565TOARGB
1695 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1696 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1697 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1698 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
1699 RGB565TOARGB
1700 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1701 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1702 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1703
1704 "ins v16.D[1], v17.D[0] \n"
1705 "ins v18.D[1], v19.D[0] \n"
1706 "ins v20.D[1], v21.D[0] \n"
1707
1708 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1709 "urshr v5.8h, v18.8h, #1 \n"
1710 "urshr v6.8h, v20.8h, #1 \n"
1711
1712 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1713 "mul v16.8h, v4.8h, v22.8h \n" // B
1714 "mls v16.8h, v5.8h, v23.8h \n" // G
1715 "mls v16.8h, v6.8h, v24.8h \n" // R
1716 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
1717 "mul v17.8h, v6.8h, v22.8h \n" // R
1718 "mls v17.8h, v5.8h, v26.8h \n" // G
1719 "mls v17.8h, v4.8h, v25.8h \n" // B
1720 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
1721 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
1722 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
1723 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1724 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1725 "b.gt 1b \n"
1726 : "+r"(src_rgb565), // %0
1727 "+r"(src_rgb565_1), // %1
1728 "+r"(dst_u), // %2
1729 "+r"(dst_v), // %3
1730 "+r"(width) // %4
1731 :
1732 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
1733 "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
1734 "v27");
1735 }
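
// Illustrative scalar sketch (not part of libyuv) of the U/V arithmetic the
// RGBToUV rows in this file compute.  It assumes b, g, r are 8-bit channel
// values already averaged over a 2x2 block; the NEON loops apply the halved
// coefficients annotated above to doubled 2x2 sums, which is equivalent.
// Names with the _Sketch suffix are illustrative only.
static uint8_t RGBToU_Sketch(int b, int g, int r) {
  int u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // bias to unsigned
  return (uint8_t)(u < 0 ? 0 : (u > 255 ? 255 : u));
}
static uint8_t RGBToV_Sketch(int b, int g, int r) {
  int v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}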
1736
1737 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1738 void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
1739 int src_stride_argb1555,
1740 uint8_t* dst_u,
1741 uint8_t* dst_v,
1742 int width) {
1743 const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1744 asm volatile(
1745 RGBTOUV_SETUP_REG
1746 "1: \n"
1747 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1748 RGB555TOARGB
1749 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1750 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1751 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1752 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
1753 RGB555TOARGB
1754 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1755 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1756 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1757
1758 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
1759 RGB555TOARGB
1760 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1761 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1762 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1763 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
1764 RGB555TOARGB
1765 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1766 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1767 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1768
1769 "ins v16.D[1], v26.D[0] \n"
1770 "ins v17.D[1], v27.D[0] \n"
1771 "ins v18.D[1], v28.D[0] \n"
1772
1773 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1774 "urshr v5.8h, v17.8h, #1 \n"
1775 "urshr v6.8h, v18.8h, #1 \n"
1776
1777 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1778 "mul v2.8h, v4.8h, v20.8h \n" // B
1779 "mls v2.8h, v5.8h, v21.8h \n" // G
1780 "mls v2.8h, v6.8h, v22.8h \n" // R
1781 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
1782 "mul v3.8h, v6.8h, v20.8h \n" // R
1783 "mls v3.8h, v5.8h, v24.8h \n" // G
1784 "mls v3.8h, v4.8h, v23.8h \n" // B
1785 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1786 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
1787 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1788 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1789 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1790 "b.gt 1b \n"
1791 : "+r"(src_argb1555), // %0
1792 "+r"(src_argb1555_1), // %1
1793 "+r"(dst_u), // %2
1794 "+r"(dst_v), // %3
1795 "+r"(width) // %4
1796 :
1797 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
1798 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
1799 "v28");
1800 }
1801
1802 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
1803 void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
1804 int src_stride_argb4444,
1805 uint8_t* dst_u,
1806 uint8_t* dst_v,
1807 int width) {
1808 const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
1809 asm volatile(
1810 RGBTOUV_SETUP_REG
1811 "1: \n"
1812 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1813 ARGB4444TOARGB
1814 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1815 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1816 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1817 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
1818 ARGB4444TOARGB
1819 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1820 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1821 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1822
1823 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
1824 ARGB4444TOARGB
1825 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1826 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1827 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1828 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
1829 ARGB4444TOARGB
1830 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1831 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1832 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1833
1834 "ins v16.D[1], v26.D[0] \n"
1835 "ins v17.D[1], v27.D[0] \n"
1836 "ins v18.D[1], v28.D[0] \n"
1837
1838 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1839 "urshr v5.8h, v17.8h, #1 \n"
1840 "urshr v6.8h, v18.8h, #1 \n"
1841
1842 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1843 "mul v2.8h, v4.8h, v20.8h \n" // B
1844 "mls v2.8h, v5.8h, v21.8h \n" // G
1845 "mls v2.8h, v6.8h, v22.8h \n" // R
1846 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
1847 "mul v3.8h, v6.8h, v20.8h \n" // R
1848 "mls v3.8h, v5.8h, v24.8h \n" // G
1849 "mls v3.8h, v4.8h, v23.8h \n" // B
1850 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1851 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
1852 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1853 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1854 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1855 "b.gt 1b \n"
1856 : "+r"(src_argb4444), // %0
1857 "+r"(src_argb4444_1), // %1
1858 "+r"(dst_u), // %2
1859 "+r"(dst_v), // %3
1860 "+r"(width) // %4
1861 :
1862 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
1863 "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
1864 "v28"
1865
1866 );
1867 }
1868
1869 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
1870 asm volatile(
1871 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
1872 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
1873 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
1874 "movi v27.8b, #16 \n" // Add 16 constant
1875 "1: \n"
1876 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1877 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1878 RGB565TOARGB
1879 "umull v3.8h, v0.8b, v24.8b \n" // B
1880 "umlal v3.8h, v1.8b, v25.8b \n" // G
1881 "umlal v3.8h, v2.8b, v26.8b \n" // R
1882 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1883 "uqadd v0.8b, v0.8b, v27.8b \n"
1884 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1885 "b.gt 1b \n"
1886 : "+r"(src_rgb565), // %0
1887 "+r"(dst_y), // %1
1888 "+r"(width) // %2
1889 :
1890 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
1891 "v27");
1892 }
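
// Illustrative scalar sketch (not part of libyuv) of the luma arithmetic used
// by the *ToYRow functions in this file: the 7-bit fixed-point coefficients
// annotated above, rounded, then offset by 16 with saturation.
static uint8_t RGBToY_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7;  // sqrshrun #7 rounds
  y += 16;                                       // uqadd with the 16 constant
  return (uint8_t)(y > 255 ? 255 : y);
}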
1893
1894 void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
1895 uint8_t* dst_y,
1896 int width) {
1897 asm volatile(
1898 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1899 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1900 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1901 "movi v7.8b, #16 \n" // Add 16 constant
1902 "1: \n"
1903 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1904 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1905 ARGB1555TOARGB
1906 "umull v3.8h, v0.8b, v4.8b \n" // B
1907 "umlal v3.8h, v1.8b, v5.8b \n" // G
1908 "umlal v3.8h, v2.8b, v6.8b \n" // R
1909 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1910 "uqadd v0.8b, v0.8b, v7.8b \n"
1911 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1912 "b.gt 1b \n"
1913 : "+r"(src_argb1555), // %0
1914 "+r"(dst_y), // %1
1915 "+r"(width) // %2
1916 :
1917 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
1918 }
1919
1920 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
1921 uint8_t* dst_y,
1922 int width) {
1923 asm volatile(
1924 "movi v24.8b, #13 \n" // B * 0.1016 coefficient
1925 "movi v25.8b, #65 \n" // G * 0.5078 coefficient
1926 "movi v26.8b, #33 \n" // R * 0.2578 coefficient
1927 "movi v27.8b, #16 \n" // Add 16 constant
1928 "1: \n"
1929 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1930 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1931 ARGB4444TOARGB
1932 "umull v3.8h, v0.8b, v24.8b \n" // B
1933 "umlal v3.8h, v1.8b, v25.8b \n" // G
1934 "umlal v3.8h, v2.8b, v26.8b \n" // R
1935 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1936 "uqadd v0.8b, v0.8b, v27.8b \n"
1937 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1938 "b.gt 1b \n"
1939 : "+r"(src_argb4444), // %0
1940 "+r"(dst_y), // %1
1941 "+r"(width) // %2
1942 :
1943 : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
1944 }
1945
1946 void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
1947 asm volatile(
1948 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
1949 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1950 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
1951 "movi v7.8b, #16 \n" // Add 16 constant
1952 "1: \n"
1953 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
1954 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1955 "umull v16.8h, v1.8b, v4.8b \n" // R
1956 "umlal v16.8h, v2.8b, v5.8b \n" // G
1957 "umlal v16.8h, v3.8b, v6.8b \n" // B
1958 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
1959 "uqadd v0.8b, v0.8b, v7.8b \n"
1960 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1961 "b.gt 1b \n"
1962 : "+r"(src_bgra), // %0
1963 "+r"(dst_y), // %1
1964 "+r"(width) // %2
1965 :
1966 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
1967 }
1968
1969 void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
1970 asm volatile(
1971 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
1972 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1973 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
1974 "movi v7.8b, #16 \n" // Add 16 constant
1975 "1: \n"
1976 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
1977 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1978 "umull v16.8h, v0.8b, v4.8b \n" // R
1979 "umlal v16.8h, v1.8b, v5.8b \n" // G
1980 "umlal v16.8h, v2.8b, v6.8b \n" // B
1981 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
1982 "uqadd v0.8b, v0.8b, v7.8b \n"
1983 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1984 "b.gt 1b \n"
1985 : "+r"(src_abgr), // %0
1986 "+r"(dst_y), // %1
1987 "+r"(width) // %2
1988 :
1989 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
1990 }
1991
1992 void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
1993 asm volatile(
1994 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1995 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1996 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1997 "movi v7.8b, #16 \n" // Add 16 constant
1998 "1: \n"
1999 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
2000 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2001 "umull v16.8h, v1.8b, v4.8b \n" // B
2002 "umlal v16.8h, v2.8b, v5.8b \n" // G
2003 "umlal v16.8h, v3.8b, v6.8b \n" // R
2004 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2005 "uqadd v0.8b, v0.8b, v7.8b \n"
2006 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2007 "b.gt 1b \n"
2008 : "+r"(src_rgba), // %0
2009 "+r"(dst_y), // %1
2010 "+r"(width) // %2
2011 :
2012 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
2013 }
2014
2015 void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
2016 asm volatile(
2017 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
2018 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2019 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
2020 "movi v7.8b, #16 \n" // Add 16 constant
2021 "1: \n"
2022 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2023 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2024 "umull v16.8h, v0.8b, v4.8b \n" // B
2025 "umlal v16.8h, v1.8b, v5.8b \n" // G
2026 "umlal v16.8h, v2.8b, v6.8b \n" // R
2027 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2028 "uqadd v0.8b, v0.8b, v7.8b \n"
2029 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2030 "b.gt 1b \n"
2031 : "+r"(src_rgb24), // %0
2032 "+r"(dst_y), // %1
2033 "+r"(width) // %2
2034 :
2035 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
2036 }
2037
2038 void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
2039 asm volatile(
2040 "movi v4.8b, #33 \n" // R * 0.2578 coefficient
2041 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
2042 "movi v6.8b, #13 \n" // B * 0.1016 coefficient
2043 "movi v7.8b, #16 \n" // Add 16 constant
2044 "1: \n"
2045 "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
2046 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2047 "umull v16.8h, v0.8b, v4.8b \n" // B
2048 "umlal v16.8h, v1.8b, v5.8b \n" // G
2049 "umlal v16.8h, v2.8b, v6.8b \n" // R
2050 "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y
2051 "uqadd v0.8b, v0.8b, v7.8b \n"
2052 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
2053 "b.gt 1b \n"
2054 : "+r"(src_raw), // %0
2055 "+r"(dst_y), // %1
2056 "+r"(width) // %2
2057 :
2058 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
2059 }
2060
2061 // Bilinear filter 16x2 -> 16x1
2062 void InterpolateRow_NEON(uint8_t* dst_ptr,
2063 const uint8_t* src_ptr,
2064 ptrdiff_t src_stride,
2065 int dst_width,
2066 int source_y_fraction) {
2067 int y1_fraction = source_y_fraction;
2068 int y0_fraction = 256 - y1_fraction;
2069 const uint8_t* src_ptr1 = src_ptr + src_stride;
2070 asm volatile(
2071 "cmp %w4, #0 \n"
2072 "b.eq 100f \n"
2073 "cmp %w4, #128 \n"
2074 "b.eq 50f \n"
2075
2076 "dup v5.16b, %w4 \n"
2077 "dup v4.16b, %w5 \n"
2078 // General purpose row blend.
2079 "1: \n"
2080 "ld1 {v0.16b}, [%1], #16 \n"
2081 "ld1 {v1.16b}, [%2], #16 \n"
2082 "subs %w3, %w3, #16 \n"
2083 "umull v2.8h, v0.8b, v4.8b \n"
2084 "umull2 v3.8h, v0.16b, v4.16b \n"
2085 "umlal v2.8h, v1.8b, v5.8b \n"
2086 "umlal2 v3.8h, v1.16b, v5.16b \n"
2087 "rshrn v0.8b, v2.8h, #8 \n"
2088 "rshrn2 v0.16b, v3.8h, #8 \n"
2089 "st1 {v0.16b}, [%0], #16 \n"
2090 "b.gt 1b \n"
2091 "b 99f \n"
2092
2093 // Blend 50 / 50.
2094 "50: \n"
2095 "ld1 {v0.16b}, [%1], #16 \n"
2096 "ld1 {v1.16b}, [%2], #16 \n"
2097 "subs %w3, %w3, #16 \n"
2098 "urhadd v0.16b, v0.16b, v1.16b \n"
2099 "st1 {v0.16b}, [%0], #16 \n"
2100 "b.gt 50b \n"
2101 "b 99f \n"
2102
2103 // Blend 100 / 0 - Copy row unchanged.
2104 "100: \n"
2105 "ld1 {v0.16b}, [%1], #16 \n"
2106 "subs %w3, %w3, #16 \n"
2107 "st1 {v0.16b}, [%0], #16 \n"
2108 "b.gt 100b \n"
2109
2110 "99: \n"
2111 : "+r"(dst_ptr), // %0
2112 "+r"(src_ptr), // %1
2113 "+r"(src_ptr1), // %2
2114 "+r"(dst_width), // %3
2115 "+r"(y1_fraction), // %4
2116 "+r"(y0_fraction) // %5
2117 :
2118 : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
2119 }
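
// Illustrative scalar sketch (not part of libyuv) of the per-byte blend the
// loop above performs: dst = (src * (256 - f) + src1 * f + 128) >> 8, with the
// f == 0 and f == 128 cases handled as a plain copy and a rounded average.
static void InterpolateRow_Sketch(uint8_t* dst,
                                  const uint8_t* src,
                                  const uint8_t* src1,
                                  int width,
                                  int f) {  // source_y_fraction
  for (int i = 0; i < width; ++i) {
    if (f == 0) {
      dst[i] = src[i];
    } else if (f == 128) {
      dst[i] = (uint8_t)((src[i] + src1[i] + 1) >> 1);  // urhadd rounds
    } else {
      dst[i] = (uint8_t)((src[i] * (256 - f) + src1[i] * f + 128) >> 8);
    }
  }
}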
2120
2121 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
2122 void ARGBBlendRow_NEON(const uint8_t* src_argb0,
2123 const uint8_t* src_argb1,
2124 uint8_t* dst_argb,
2125 int width) {
2126 asm volatile(
2127 "subs %w3, %w3, #8 \n"
2128 "b.lt 89f \n"
2129 // Blend 8 pixels.
2130 "8: \n"
2131 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0
2132 // pixels
2133 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1
2134 // pixels
2135 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2136 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2137 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2138 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2139 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2140 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2141 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2142 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2143 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2144 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2145 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2146 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2147 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2148 "movi v3.8b, #255 \n" // a = 255
2149 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
2150 // pixels
2151 "b.ge 8b \n"
2152
2153 "89: \n"
2154 "adds %w3, %w3, #8-1 \n"
2155 "b.lt 99f \n"
2156
2157 // Blend 1 pixels.
2158 "1: \n"
2159 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
2160 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
2161 "subs %w3, %w3, #1 \n" // 1 processed per loop.
2162 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2163 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2164 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2165 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2166 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2167 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2168 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2169 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2170 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2171 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2172 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2173 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2174 "movi v3.8b, #255 \n" // a = 255
2175 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
2176 "b.ge 1b \n"
2177
2178 "99: \n"
2179
2180 : "+r"(src_argb0), // %0
2181 "+r"(src_argb1), // %1
2182 "+r"(dst_argb), // %2
2183 "+r"(width) // %3
2184 :
2185 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
2186 "v17", "v18");
2187 }
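
// Illustrative scalar sketch (not part of libyuv) of the per-channel "over"
// blend in the loop above: dst = s + d - (d * sa + 128) / 256, saturated, with
// destination alpha forced to 255.  s is the foreground (src_argb0) channel,
// d the background (src_argb1) channel, sa the foreground alpha.
static uint8_t BlendChannel_Sketch(uint8_t s, uint8_t d, uint8_t sa) {
  int da = d - ((d * sa + 128) >> 8);  // uqrshrn #8 then uqsub
  int out = s + (da < 0 ? 0 : da);     // uqadd saturates the sum
  return (uint8_t)(out > 255 ? 255 : out);
}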
2188
2189 // Attenuate 8 pixels at a time.
2190 void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
2191 uint8_t* dst_argb,
2192 int width) {
2193 asm volatile(
2194 // Attenuate 8 pixels.
2195 "1: \n"
2196 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
2197 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2198 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2199 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2200 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2201 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2202 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2203 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2204 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
2205 // pixels
2206 "b.gt 1b \n"
2207 : "+r"(src_argb), // %0
2208 "+r"(dst_argb), // %1
2209 "+r"(width) // %2
2210 :
2211 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
2212 }
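
// Illustrative scalar sketch (not part of libyuv) of the attenuation above:
// each color channel is multiplied by alpha and rounded back to 8 bits
// (roughly c * a / 256); alpha itself is passed through unchanged.
static uint8_t Attenuate_Sketch(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 128) >> 8);  // uqrshrn #8
}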
2213
2214 // Quantize 8 ARGB pixels (32 bytes).
2215 // dst = (dst * scale >> 16) * interval_size + interval_offset;
2216 void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
2217 int scale,
2218 int interval_size,
2219 int interval_offset,
2220 int width) {
2221 asm volatile(
2222 "dup v4.8h, %w2 \n"
2223 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
2224 "dup v5.8h, %w3 \n" // interval multiply.
2225 "dup v6.8h, %w4 \n" // interval add
2226
2227 // 8 pixel loop.
2228 "1: \n"
2229 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
2230 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2231 "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
2232 "uxtl v1.8h, v1.8b \n"
2233 "uxtl v2.8h, v2.8b \n"
2234 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
2235 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
2236 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
2237 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
2238 "mul v1.8h, v1.8h, v5.8h \n" // g
2239 "mul v2.8h, v2.8h, v5.8h \n" // r
2240 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
2241 "add v1.8h, v1.8h, v6.8h \n" // g
2242 "add v2.8h, v2.8h, v6.8h \n" // r
2243 "uqxtn v0.8b, v0.8h \n"
2244 "uqxtn v1.8b, v1.8h \n"
2245 "uqxtn v2.8b, v2.8h \n"
2246 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
2247 "b.gt 1b \n"
2248 : "+r"(dst_argb), // %0
2249 "+r"(width) // %1
2250 : "r"(scale), // %2
2251 "r"(interval_size), // %3
2252 "r"(interval_offset) // %4
2253 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
2254 }
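
// Illustrative scalar sketch (not part of libyuv) of the stated quantize
// formula; the NEON code approximates the (v * scale) >> 16 step with sqdmulh
// on scale / 2 and saturates the final value to 8 bits.
static uint8_t Quantize_Sketch(uint8_t v,
                               int scale,
                               int interval_size,
                               int interval_offset) {
  int out = ((v * scale) >> 16) * interval_size + interval_offset;
  return (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out));
}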
2255
2256 // Shade 8 pixels at a time by specified value.
2257 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2258 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
2259 void ARGBShadeRow_NEON(const uint8_t* src_argb,
2260 uint8_t* dst_argb,
2261 int width,
2262 uint32_t value) {
2263 asm volatile(
2264 "dup v0.4s, %w3 \n" // duplicate scale value.
2265 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
2266 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
2267
2268 // 8 pixel loop.
2269 "1: \n"
2270 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB
2271 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2272 "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
2273 "uxtl v5.8h, v5.8b \n"
2274 "uxtl v6.8h, v6.8b \n"
2275 "uxtl v7.8h, v7.8b \n"
2276 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
2277 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
2278 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
2279 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
2280 "uqxtn v4.8b, v4.8h \n"
2281 "uqxtn v5.8b, v5.8h \n"
2282 "uqxtn v6.8b, v6.8h \n"
2283 "uqxtn v7.8b, v7.8h \n"
2284 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
2285 "b.gt 1b \n"
2286 : "+r"(src_argb), // %0
2287 "+r"(dst_argb), // %1
2288 "+r"(width) // %2
2289 : "r"(value) // %3
2290 : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
2291 }
2292
2293 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2294 // Similar to ARGBToYJ but stores ARGB.
2295 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
2296 void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
2297 asm volatile(
2298 "movi v24.8b, #15 \n" // B * 0.11400 coefficient
2299 "movi v25.8b, #75 \n" // G * 0.58700 coefficient
2300 "movi v26.8b, #38 \n" // R * 0.29900 coefficient
2301 "1: \n"
2302 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
2303 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2304 "umull v4.8h, v0.8b, v24.8b \n" // B
2305 "umlal v4.8h, v1.8b, v25.8b \n" // G
2306 "umlal v4.8h, v2.8b, v26.8b \n" // R
2307 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
2308 "orr v1.8b, v0.8b, v0.8b \n" // G
2309 "orr v2.8b, v0.8b, v0.8b \n" // R
2310 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
2311 "b.gt 1b \n"
2312 : "+r"(src_argb), // %0
2313 "+r"(dst_argb), // %1
2314 "+r"(width) // %2
2315 :
2316 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
2317 }
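
// Illustrative scalar sketch (not part of libyuv) of the gray conversion
// above, using the C formula quoted in the comment: the rounded luma replaces
// B, G and R while alpha is carried through.
static void GrayPixel_Sketch(uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t y = (uint8_t)((15 * *b + 75 * *g + 38 * *r + 64) >> 7);
  *b = y;
  *g = y;
  *r = y;
}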
2318
2319 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2320 // b = (r * 35 + g * 68 + b * 17) >> 7
2321 // g = (r * 45 + g * 88 + b * 22) >> 7
2322 // r = (r * 50 + g * 98 + b * 24) >> 7
2323
2324 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
2325 asm volatile(
2326 "movi v20.8b, #17 \n" // BB coefficient
2327 "movi v21.8b, #68 \n" // BG coefficient
2328 "movi v22.8b, #35 \n" // BR coefficient
2329 "movi v24.8b, #22 \n" // GB coefficient
2330 "movi v25.8b, #88 \n" // GG coefficient
2331 "movi v26.8b, #45 \n" // GR coefficient
2332 "movi v28.8b, #24 \n" // BB coefficient
2333 "movi v29.8b, #98 \n" // BG coefficient
2334 "movi v30.8b, #50 \n" // BR coefficient
2335 "1: \n"
2336 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
2337 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2338 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
2339 "umlal v4.8h, v1.8b, v21.8b \n" // G
2340 "umlal v4.8h, v2.8b, v22.8b \n" // R
2341 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
2342 "umlal v5.8h, v1.8b, v25.8b \n" // G
2343 "umlal v5.8h, v2.8b, v26.8b \n" // R
2344 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
2345 "umlal v6.8h, v1.8b, v29.8b \n" // G
2346 "umlal v6.8h, v2.8b, v30.8b \n" // R
2347 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
2348 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
2349 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
2350 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
2351 "b.gt 1b \n"
2352 : "+r"(dst_argb), // %0
2353 "+r"(width) // %1
2354 :
2355 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
2356 "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
2357 }
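
// Illustrative scalar sketch (not part of libyuv) of the sepia formulas quoted
// above: each output channel is an independent weighted sum of the original
// B/G/R, shifted down by 7 bits and saturated to 255 (uqshrn truncates rather
// than rounds).
static void SepiaPixel_Sketch(uint8_t* b, uint8_t* g, uint8_t* r) {
  int nb = (17 * *b + 68 * *g + 35 * *r) >> 7;
  int ng = (22 * *b + 88 * *g + 45 * *r) >> 7;
  int nr = (24 * *b + 98 * *g + 50 * *r) >> 7;
  *b = (uint8_t)(nb > 255 ? 255 : nb);
  *g = (uint8_t)(ng > 255 ? 255 : ng);
  *r = (uint8_t)(nr > 255 ? 255 : nr);
}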
2358
2359 // Transform 8 ARGB pixels (32 bytes) with color matrix.
2360 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
2361 // needs to saturate. Consider doing a non-saturating version.
2362 void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
2363 uint8_t* dst_argb,
2364 const int8_t* matrix_argb,
2365 int width) {
2366 asm volatile(
2367 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
2368 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
2369 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
2370
2371 "1: \n"
2372 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB
2373 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2374 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
2375 "uxtl v17.8h, v17.8b \n" // g
2376 "uxtl v18.8h, v18.8b \n" // r
2377 "uxtl v19.8h, v19.8b \n" // a
2378 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
2379 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
2380 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
2381 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
2382 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
2383 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
2384 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
2385 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
2386 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
2387 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
2388 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
2389 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
2390 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
2391 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
2392 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
2393 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
2394 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
2395 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
2396 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
2397 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
2398 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
2399 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
2400 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
2401 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
2402 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
2403 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
2404 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
2405 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
2406 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
2407 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
2408 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
2409 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
2410 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
2411 "b.gt 1b \n"
2412 : "+r"(src_argb), // %0
2413 "+r"(dst_argb), // %1
2414 "+r"(width) // %2
2415 : "r"(matrix_argb) // %3
2416 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
2417 "v17", "v18", "v19", "v22", "v23", "v24", "v25");
2418 }
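
// Illustrative scalar sketch (not part of libyuv) of the color-matrix math
// above: each output channel is a weighted sum of B, G, R and A with signed
// 8-bit coefficients, shifted down by 6 bits and clamped to [0, 255].  The
// register indexing above implies rows of four coefficients in B, G, R, A
// order; the 32-bit accumulation here ignores the intermediate 16-bit
// saturating adds the NEON version performs.
static void ColorMatrixPixel_Sketch(const uint8_t bgra_in[4],
                                    uint8_t bgra_out[4],
                                    const int8_t m[16]) {
  for (int ch = 0; ch < 4; ++ch) {
    int sum = bgra_in[0] * m[ch * 4 + 0] + bgra_in[1] * m[ch * 4 + 1] +
              bgra_in[2] * m[ch * 4 + 2] + bgra_in[3] * m[ch * 4 + 3];
    sum >>= 6;  // sqshrun #6
    bgra_out[ch] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}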
2419
2420 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2421 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
2422 void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
2423 const uint8_t* src_argb1,
2424 uint8_t* dst_argb,
2425 int width) {
2426 asm volatile(
2427 // 8 pixel loop.
2428 "1: \n"
2429 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
2430 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
2431 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2432 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
2433 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
2434 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
2435 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
2436 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
2437 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
2438 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
2439 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
2440 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
2441 "b.gt 1b \n"
2442 : "+r"(src_argb0), // %0
2443 "+r"(src_argb1), // %1
2444 "+r"(dst_argb), // %2
2445 "+r"(width) // %3
2446 :
2447 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2448 }
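
// Illustrative scalar sketch (not part of libyuv) of the per-channel multiply
// above as currently written: out = (c0 * c1 + 128) >> 8, i.e. roughly
// c0 * c1 / 255 with rounding.
static uint8_t MultiplyChannel_Sketch(uint8_t c0, uint8_t c1) {
  return (uint8_t)((c0 * c1 + 128) >> 8);  // rshrn #8
}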
2449
2450 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
2451 void ARGBAddRow_NEON(const uint8_t* src_argb0,
2452 const uint8_t* src_argb1,
2453 uint8_t* dst_argb,
2454 int width) {
2455 asm volatile(
2456 // 8 pixel loop.
2457 "1: \n"
2458 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
2459 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
2460 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2461 "uqadd v0.8b, v0.8b, v4.8b \n"
2462 "uqadd v1.8b, v1.8b, v5.8b \n"
2463 "uqadd v2.8b, v2.8b, v6.8b \n"
2464 "uqadd v3.8b, v3.8b, v7.8b \n"
2465 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
2466 "b.gt 1b \n"
2467 : "+r"(src_argb0), // %0
2468 "+r"(src_argb1), // %1
2469 "+r"(dst_argb), // %2
2470 "+r"(width) // %3
2471 :
2472 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2473 }
2474
2475 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
2476 void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
2477 const uint8_t* src_argb1,
2478 uint8_t* dst_argb,
2479 int width) {
2480 asm volatile(
2481 // 8 pixel loop.
2482 "1: \n"
2483 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
2484 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more
2485 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2486 "uqsub v0.8b, v0.8b, v4.8b \n"
2487 "uqsub v1.8b, v1.8b, v5.8b \n"
2488 "uqsub v2.8b, v2.8b, v6.8b \n"
2489 "uqsub v3.8b, v3.8b, v7.8b \n"
2490 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
2491 "b.gt 1b \n"
2492 : "+r"(src_argb0), // %0
2493 "+r"(src_argb1), // %1
2494 "+r"(dst_argb), // %2
2495 "+r"(width) // %3
2496 :
2497 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2498 }
2499
2500 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2501 // A = 255
2502 // R = Sobel
2503 // G = Sobel
2504 // B = Sobel
2505 void SobelRow_NEON(const uint8_t* src_sobelx,
2506 const uint8_t* src_sobely,
2507 uint8_t* dst_argb,
2508 int width) {
2509 asm volatile(
2510 "movi v3.8b, #255 \n" // alpha
2511 // 8 pixel loop.
2512 "1: \n"
2513 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
2514 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
2515 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2516 "uqadd v0.8b, v0.8b, v1.8b \n" // add
2517 "orr v1.8b, v0.8b, v0.8b \n"
2518 "orr v2.8b, v0.8b, v0.8b \n"
2519 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
2520 "b.gt 1b \n"
2521 : "+r"(src_sobelx), // %0
2522 "+r"(src_sobely), // %1
2523 "+r"(dst_argb), // %2
2524 "+r"(width) // %3
2525 :
2526 : "cc", "memory", "v0", "v1", "v2", "v3");
2527 }
2528
2529 // Adds Sobel X and Sobel Y and stores Sobel into plane.
2530 void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
2531 const uint8_t* src_sobely,
2532 uint8_t* dst_y,
2533 int width) {
2534 asm volatile(
2535 // 16 pixel loop.
2536 "1: \n"
2537 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
2538 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
2539 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2540 "uqadd v0.16b, v0.16b, v1.16b \n" // add
2541 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
2542 "b.gt 1b \n"
2543 : "+r"(src_sobelx), // %0
2544 "+r"(src_sobely), // %1
2545 "+r"(dst_y), // %2
2546 "+r"(width) // %3
2547 :
2548 : "cc", "memory", "v0", "v1");
2549 }
2550
2551 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2552 // A = 255
2553 // R = Sobel X
2554 // G = Sobel
2555 // B = Sobel Y
2556 void SobelXYRow_NEON(const uint8_t* src_sobelx,
2557 const uint8_t* src_sobely,
2558 uint8_t* dst_argb,
2559 int width) {
2560 asm volatile(
2561 "movi v3.8b, #255 \n" // alpha
2562 // 8 pixel loop.
2563 "1: \n"
2564 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
2565 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
2566 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2567 "uqadd v1.8b, v0.8b, v2.8b \n" // add
2568 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
2569 "b.gt 1b \n"
2570 : "+r"(src_sobelx), // %0
2571 "+r"(src_sobely), // %1
2572 "+r"(dst_argb), // %2
2573 "+r"(width) // %3
2574 :
2575 : "cc", "memory", "v0", "v1", "v2", "v3");
2576 }
2577
2578 // SobelX as a matrix is
2579 // -1 0 1
2580 // -2 0 2
2581 // -1 0 1
2582 void SobelXRow_NEON(const uint8_t* src_y0,
2583 const uint8_t* src_y1,
2584 const uint8_t* src_y2,
2585 uint8_t* dst_sobelx,
2586 int width) {
2587 asm volatile(
2588 "1: \n"
2589 "ld1 {v0.8b}, [%0],%5 \n" // top
2590 "ld1 {v1.8b}, [%0],%6 \n"
2591 "usubl v0.8h, v0.8b, v1.8b \n"
2592 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
2593 "ld1 {v3.8b}, [%1],%6 \n"
2594 "usubl v1.8h, v2.8b, v3.8b \n"
2595 "add v0.8h, v0.8h, v1.8h \n"
2596 "add v0.8h, v0.8h, v1.8h \n"
2597 "ld1 {v2.8b}, [%2],%5 \n" // bottom
2598 "ld1 {v3.8b}, [%2],%6 \n"
2599 "subs %w4, %w4, #8 \n" // 8 pixels
2600 "usubl v1.8h, v2.8b, v3.8b \n"
2601 "add v0.8h, v0.8h, v1.8h \n"
2602 "abs v0.8h, v0.8h \n"
2603 "uqxtn v0.8b, v0.8h \n"
2604 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
2605 "b.gt 1b \n"
2606 : "+r"(src_y0), // %0
2607 "+r"(src_y1), // %1
2608 "+r"(src_y2), // %2
2609 "+r"(dst_sobelx), // %3
2610 "+r"(width) // %4
2611 : "r"(2LL), // %5
2612 "r"(6LL) // %6
2613 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2614 );
2615 }
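
// Illustrative scalar sketch (not part of libyuv) of the Sobel X response the
// loop above computes from three input rows, matching the matrix shown: the
// difference of columns two pixels apart, weighted 1-2-1 vertically, absolute
// value, saturated to 8 bits.
static uint8_t SobelX_Sketch(const uint8_t* y0,
                             const uint8_t* y1,
                             const uint8_t* y2,
                             int i) {
  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}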
2616
2617 // SobelY as a matrix is
2618 // -1 -2 -1
2619 // 0 0 0
2620 // 1 2 1
2621 void SobelYRow_NEON(const uint8_t* src_y0,
2622 const uint8_t* src_y1,
2623 uint8_t* dst_sobely,
2624 int width) {
2625 asm volatile(
2626 "1: \n"
2627 "ld1 {v0.8b}, [%0],%4 \n" // left
2628 "ld1 {v1.8b}, [%1],%4 \n"
2629 "usubl v0.8h, v0.8b, v1.8b \n"
2630 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
2631 "ld1 {v3.8b}, [%1],%4 \n"
2632 "usubl v1.8h, v2.8b, v3.8b \n"
2633 "add v0.8h, v0.8h, v1.8h \n"
2634 "add v0.8h, v0.8h, v1.8h \n"
2635 "ld1 {v2.8b}, [%0],%5 \n" // right
2636 "ld1 {v3.8b}, [%1],%5 \n"
2637 "subs %w3, %w3, #8 \n" // 8 pixels
2638 "usubl v1.8h, v2.8b, v3.8b \n"
2639 "add v0.8h, v0.8h, v1.8h \n"
2640 "abs v0.8h, v0.8h \n"
2641 "uqxtn v0.8b, v0.8h \n"
2642 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
2643 "b.gt 1b \n"
2644 : "+r"(src_y0), // %0
2645 "+r"(src_y1), // %1
2646 "+r"(dst_sobely), // %2
2647 "+r"(width) // %3
2648 : "r"(1LL), // %4
2649 "r"(6LL) // %5
2650 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2651 );
2652 }
2653
2654 // Caveat - rounds float to half float whereas scaling version truncates.
2655 void HalfFloat1Row_NEON(const uint16_t* src,
2656 uint16_t* dst,
2657 float /*unused*/,
2658 int width) {
2659 asm volatile(
2660 "1: \n"
2661 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
2662 "subs %w2, %w2, #8 \n" // 8 pixels per loop
2663 "uxtl v2.4s, v1.4h \n" // 8 int's
2664 "uxtl2 v3.4s, v1.8h \n"
2665 "scvtf v2.4s, v2.4s \n" // 8 floats
2666 "scvtf v3.4s, v3.4s \n"
2667 "fcvtn v1.4h, v2.4s \n" // 8 half floats
2668 "fcvtn2 v1.8h, v3.4s \n"
2669 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
2670 "b.gt 1b \n"
2671 : "+r"(src), // %0
2672 "+r"(dst), // %1
2673 "+r"(width) // %2
2674 :
2675 : "cc", "memory", "v1", "v2", "v3");
2676 }
2677
2678 void HalfFloatRow_NEON(const uint16_t* src,
2679 uint16_t* dst,
2680 float scale,
2681 int width) {
2682 asm volatile(
2683 "1: \n"
2684 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
2685 "subs %w2, %w2, #8 \n" // 8 pixels per loop
2686 "uxtl v2.4s, v1.4h \n" // 8 int's
2687 "uxtl2 v3.4s, v1.8h \n"
2688 "scvtf v2.4s, v2.4s \n" // 8 floats
2689 "scvtf v3.4s, v3.4s \n"
2690 "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
2691 "fmul v3.4s, v3.4s, %3.s[0] \n"
2692 "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
2693 "uqshrn2 v1.8h, v3.4s, #13 \n"
2694 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
2695 "b.gt 1b \n"
2696 : "+r"(src), // %0
2697 "+r"(dst), // %1
2698 "+r"(width) // %2
2699 : "w"(scale * 1.9259299444e-34f) // %3
2700 : "cc", "memory", "v1", "v2", "v3");
2701 }
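
// Illustrative scalar sketch (not part of libyuv) of the half-float trick used
// above.  The constant 1.9259299444e-34f is 2^-112, which rebiases the float
// exponent (bias 127) down to the half-float exponent (bias 15).  After that
// multiply, bits 13..28 of the single-precision encoding are the half-float
// encoding of the (non-negative) value, and the shift extracts them,
// truncating the mantissa as the caveat above notes.  The union type-pun is
// fine for the GCC/clang builds this file targets.
static uint16_t UShortToHalf_Sketch(uint16_t v, float scale) {
  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = (float)v * scale * 1.9259299444e-34f;  // value * scale * 2^-112
  uint32_t h = bits.u >> 13;                      // uqshrn #13
  return (uint16_t)(h > 0xffff ? 0xffff : h);     // saturate like uqshrn
}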
2702
2703 void ByteToFloatRow_NEON(const uint8_t* src,
2704 float* dst,
2705 float scale,
2706 int width) {
2707 asm volatile(
2708 "1: \n"
2709 "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
2710 "subs %w2, %w2, #8 \n" // 8 pixels per loop
2711 "uxtl v1.8h, v1.8b \n" // 8 shorts
2712 "uxtl v2.4s, v1.4h \n" // 8 ints
2713 "uxtl2 v3.4s, v1.8h \n"
2714 "scvtf v2.4s, v2.4s \n" // 8 floats
2715 "scvtf v3.4s, v3.4s \n"
2716 "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
2717 "fmul v3.4s, v3.4s, %3.s[0] \n"
2718 "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
2719 "b.gt 1b \n"
2720 : "+r"(src), // %0
2721 "+r"(dst), // %1
2722 "+r"(width) // %2
2723 : "w"(scale) // %3
2724 : "cc", "memory", "v1", "v2", "v3");
2725 }
2726
2727 float ScaleMaxSamples_NEON(const float* src,
2728 float* dst,
2729 float scale,
2730 int width) {
2731 float fmax;
2732 asm volatile(
2733 "movi v5.4s, #0 \n" // max
2734 "movi v6.4s, #0 \n"
2735
2736 "1: \n"
2737 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
2738 "subs %w2, %w2, #8 \n" // 8 processed per loop
2739 "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
2740 "fmul v4.4s, v2.4s, %4.s[0] \n" // scale
2741 "fmax v5.4s, v5.4s, v1.4s \n" // max
2742 "fmax v6.4s, v6.4s, v2.4s \n"
2743 "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
2744 "b.gt 1b \n"
2745 "fmax v5.4s, v5.4s, v6.4s \n" // max
2746 "fmaxv %s3, v5.4s \n" // signed max acculator
2747 : "+r"(src), // %0
2748 "+r"(dst), // %1
2749 "+r"(width), // %2
2750 "=w"(fmax) // %3
2751 : "w"(scale) // %4
2752 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
2753 return fmax;
2754 }
2755
2756 float ScaleSumSamples_NEON(const float* src,
2757 float* dst,
2758 float scale,
2759 int width) {
2760 float fsum;
2761 asm volatile(
2762 "movi v5.4s, #0 \n" // max
2763 "movi v6.4s, #0 \n" // max
2764
2765 "1: \n"
2766 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
2767 "subs %w2, %w2, #8 \n" // 8 processed per loop
2768 "fmul v3.4s, v1.4s, %4.s[0] \n" // scale
2769 "fmul v4.4s, v2.4s, %4.s[0] \n"
2770 "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
2771 "fmla v6.4s, v2.4s, v2.4s \n"
2772 "st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
2773 "b.gt 1b \n"
2774 "faddp v5.4s, v5.4s, v6.4s \n"
2775 "faddp v5.4s, v5.4s, v5.4s \n"
2776 "faddp %3.4s, v5.4s, v5.4s \n" // sum
2777 : "+r"(src), // %0
2778 "+r"(dst), // %1
2779 "+r"(width), // %2
2780 "=w"(fsum) // %3
2781 : "w"(scale) // %4
2782 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
2783 return fsum;
2784 }
2785
2786 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
2787 asm volatile(
2788 "1: \n"
2789 "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples
2790 "subs %w2, %w2, #8 \n" // 8 processed per loop
2791 "fmul v1.4s, v1.4s, %3.s[0] \n" // scale
2792 "fmul v2.4s, v2.4s, %3.s[0] \n" // scale
2793 "st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
2794 "b.gt 1b \n"
2795 : "+r"(src), // %0
2796 "+r"(dst), // %1
2797 "+r"(width) // %2
2798 : "w"(scale) // %3
2799 : "cc", "memory", "v1", "v2");
2800 }
2801
2802 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
2803 void GaussCol_NEON(const uint16_t* src0,
2804 const uint16_t* src1,
2805 const uint16_t* src2,
2806 const uint16_t* src3,
2807 const uint16_t* src4,
2808 uint32_t* dst,
2809 int width) {
2810 asm volatile(
2811 "movi v6.8h, #4 \n" // constant 4
2812 "movi v7.8h, #6 \n" // constant 6
2813
2814 "1: \n"
2815 "ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
2816 "ld1 {v2.8h}, [%4], #16 \n"
2817 "uaddl v0.4s, v1.4h, v2.4h \n" // * 1
2818 "uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
2819 "ld1 {v2.8h}, [%1], #16 \n"
2820 "umlal v0.4s, v2.4h, v6.4h \n" // * 4
2821 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
2822 "ld1 {v2.8h}, [%2], #16 \n"
2823 "umlal v0.4s, v2.4h, v7.4h \n" // * 6
2824 "umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
2825 "ld1 {v2.8h}, [%3], #16 \n"
2826 "umlal v0.4s, v2.4h, v6.4h \n" // * 4
2827 "umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
2828 "subs %w6, %w6, #8 \n" // 8 processed per loop
2829 "st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
2830 "b.gt 1b \n"
2831 : "+r"(src0), // %0
2832 "+r"(src1), // %1
2833 "+r"(src2), // %2
2834 "+r"(src3), // %3
2835 "+r"(src4), // %4
2836 "+r"(dst), // %5
2837 "+r"(width) // %6
2838 :
2839 : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
2840 }
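
// Illustrative scalar sketch (not part of libyuv) of the vertical 1-4-6-4-1
// Gaussian pass above: five 16-bit input rows are combined into one 32-bit
// accumulator row, with no shift applied at this stage.
static void GaussCol_Sketch(const uint16_t* s0,
                            const uint16_t* s1,
                            const uint16_t* s2,
                            const uint16_t* s3,
                            const uint16_t* s4,
                            uint32_t* dst,
                            int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = s0[i] + 4 * s1[i] + 6 * s2[i] + 4 * s3[i] + s4[i];
  }
}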
2841
2842 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
2843 void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
2844 const uint32_t* src1 = src + 1;
2845 const uint32_t* src2 = src + 2;
2846 const uint32_t* src3 = src + 3;
2847 asm volatile(
2848 "movi v6.4s, #4 \n" // constant 4
2849 "movi v7.4s, #6 \n" // constant 6
2850
2851 "1: \n"
2852 "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples
2853 "add v0.4s, v0.4s, v1.4s \n" // * 1
2854 "add v1.4s, v1.4s, v2.4s \n" // * 1
2855 "ld1 {v2.4s,v3.4s}, [%2], #32 \n"
2856 "mla v0.4s, v2.4s, v7.4s \n" // * 6
2857 "mla v1.4s, v3.4s, v7.4s \n" // * 6
2858 "ld1 {v2.4s,v3.4s}, [%1], #32 \n"
2859 "ld1 {v4.4s,v5.4s}, [%3], #32 \n"
2860 "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
2861 "add v3.4s, v3.4s, v5.4s \n"
2862 "mla v0.4s, v2.4s, v6.4s \n" // * 4
2863 "mla v1.4s, v3.4s, v6.4s \n" // * 4
2864 "subs %w5, %w5, #8 \n" // 8 processed per loop
2865 "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack
2866 "uqrshrn2 v0.8h, v1.4s, #8 \n"
2867 "st1 {v0.8h}, [%4], #16 \n" // store 8 samples
2868 "b.gt 1b \n"
2869 : "+r"(src), // %0
2870 "+r"(src1), // %1
2871 "+r"(src2), // %2
2872 "+r"(src3), // %3
2873 "+r"(dst), // %4
2874 "+r"(width) // %5
2875 : "r"(32LL) // %6
2876 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2877 }
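
// Illustrative scalar sketch (not part of libyuv) of the horizontal 1-4-6-4-1
// pass above: five neighbouring 32-bit column sums are combined, rounded and
// scaled back down by 8 bits into 16-bit output.
static void GaussRow_Sketch(const uint32_t* src, uint16_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t sum = src[i] + 4 * src[i + 1] + 6 * src[i + 2] + 4 * src[i + 3] +
                   src[i + 4];
    dst[i] = (uint16_t)((sum + 128) >> 8);  // uqrshrn #8 rounds
  }
}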
2878
2879 // Convert biplanar NV21 to packed YUV24
2880 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
2881 const uint8_t* src_vu,
2882 uint8_t* dst_yuv24,
2883 int width) {
2884 asm volatile(
2885 "1: \n"
2886 "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
2887 "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
2888 "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
2889 "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
2890 "subs %w3, %w3, #16 \n" // 16 pixels per loop
2891 "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
2892 "b.gt 1b \n"
2893 : "+r"(src_y), // %0
2894 "+r"(src_vu), // %1
2895 "+r"(dst_yuv24), // %2
2896 "+r"(width) // %3
2897 :
2898 : "cc", "memory", "v0", "v1", "v2");
2899 }
2900
2901 void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
2902 int src_stride_ayuv,
2903 uint8_t* dst_uv,
2904 int width) {
2905 const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
2906 asm volatile(
2907
2908 "1: \n"
2909 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
2910 // pixels.
2911 "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
2912 "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
2913 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
2914 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
2915 "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
2916 "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
2917 "uqrshrn v2.8b, v1.8h, #2 \n"
2918 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2919 "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
2920 "b.gt 1b \n"
2921 : "+r"(src_ayuv), // %0
2922 "+r"(src_ayuv_1), // %1
2923 "+r"(dst_uv), // %2
2924 "+r"(width) // %3
2925 :
2926 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2927 }
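
// Illustrative scalar sketch (not part of libyuv) of the 2x2 chroma
// subsampling above: U and V are each averaged over a 2x2 block of AYUV
// pixels (two per row, two rows) with rounding, then stored interleaved as UV.
// row0/row1 point at 8 bytes (two AYUV pixels) in adjacent rows; the channel
// offsets follow the V,U,Y,A byte order the comments above assume.
static void AYUV2x2ToUV_Sketch(const uint8_t* row0,
                               const uint8_t* row1,
                               uint8_t* u,
                               uint8_t* v) {
  *v = (uint8_t)((row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2);
  *u = (uint8_t)((row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2);
}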
2928
2929 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
2930 int src_stride_ayuv,
2931 uint8_t* dst_vu,
2932 int width) {
2933 const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
2934 asm volatile(
2935
2936 "1: \n"
2937 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
2938 // pixels.
2939 "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
2940 "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
2941 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
2942 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
2943 "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
2944 "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
2945 "uqrshrn v1.8b, v1.8h, #2 \n"
2946 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2947 "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
2948 "b.gt 1b \n"
2949 : "+r"(src_ayuv), // %0
2950 "+r"(src_ayuv_1), // %1
2951 "+r"(dst_vu), // %2
2952 "+r"(width) // %3
2953 :
2954 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2955 }
2956
2957 // Copy row of AYUV Y's into Y
2958 void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
2959 asm volatile(
2960 "1: \n"
2961 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
2962 // pixels
2963 "subs %w2, %w2, #16 \n" // 16 pixels per loop
2964 "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
2965 "b.gt 1b \n"
2966 : "+r"(src_ayuv), // %0
2967 "+r"(dst_y), // %1
2968 "+r"(width) // %2
2969 :
2970 : "cc", "memory", "v0", "v1", "v2", "v3");
2971 }
2972
2973 void FloatDivToByteRow_NEON(const float* src_weights,
2974 const float* src_values,
2975 uint8_t* dst_out,
2976 uint8_t* dst_mask,
2977 int width) {
2978 asm volatile(
2979 "movi v0.4s, #0 \n"
2980
2981 "1: \n"
2982 "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights
2983 "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values
2984 "subs %w4, %w4, #8 \n" // 8 pixels per loop
2985
2986 "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights
2987 "fdiv v2.4s, v4.4s, v2.4s \n"
2988
2989 "fcvtas v1.4s, v1.4s \n" // float to int
2990 "fcvtas v2.4s, v2.4s \n" // float to int
2991 "uqxtn v1.4h, v1.4s \n" // 8 shorts
2992 "uqxtn2 v1.8h, v2.4s \n"
2993 "uqxtn v1.8b, v1.8h \n" // 8 bytes
2994
2995 "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
2996
2997 "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero
2998 "fcmgt v6.4s, v2.4s, v0.4s \n"
2999 "uqxtn v5.4h, v5.4s \n" // 8 shorts
3000 "uqxtn2 v5.8h, v6.4s \n"
3001 "uqxtn v5.8b, v1.8h \n" // 8 bytes
3002
3003 "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
3004
3005 "b.gt 1b \n"
3006 : "+r"(src_weights), // %0
3007 "+r"(src_values), // %1
3008 "+r"(dst_out), // %2
3009 "+r"(dst_mask), // %3
3010 "+r"(width) // %4
3011 :
3012 : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
3013 }
3014
3015 // Convert biplanar UV channel of NV12 to NV21
3016 void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
3017 asm volatile(
3018 "1: \n"
3019 "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
3020 "orr v2.16b, v0.16b, v0.16b \n" // move U after V
3021 "subs %w2, %w2, #16 \n" // 16 pixels per loop
3022 "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
3023 "b.gt 1b \n"
3024 : "+r"(src_uv), // %0
3025 "+r"(dst_vu), // %1
3026 "+r"(width) // %2
3027 :
3028 : "cc", "memory", "v0", "v1", "v2");
3029 }
3030
3031 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
3032
3033 #ifdef __cplusplus
3034 } // extern "C"
3035 } // namespace libyuv
3036 #endif
3037