1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 Y bytes from %0 into v0.8b, 4 U bytes from %1 into v1.s[0] and
// 4 V bytes from %2 into v1.s[1]. Each pointer is post-incremented by the
// number of bytes read.
#define READYUV422 \
  MEMACCESS(0) \
  "ld1 {v0.8b}, [%0], #8 \n" \
  MEMACCESS(1) \
  "ld1 {v1.s}[0], [%1], #4 \n" \
  MEMACCESS(2) \
  "ld1 {v1.s}[1], [%2], #4 \n"
29
// Read 8 Y, 2 U and 2 V from 411.
// Loads 8 Y bytes from %0 into v0.8b, 2 U bytes from %1 into v2.h[0] and
// 2 V bytes from %2 into v2.h[1]. The zip1 interleaves v2 with itself so
// every chroma byte is duplicated: v1 ends up with 4 U bytes (bytes 0-3)
// followed by 4 V bytes (bytes 4-7), the same layout READYUV422 produces.
// Uses v2 as scratch.
#define READYUV411 \
  MEMACCESS(0) \
  "ld1 {v0.8b}, [%0], #8 \n" \
  MEMACCESS(1) \
  "ld1 {v2.h}[0], [%1], #2 \n" \
  MEMACCESS(2) \
  "ld1 {v2.h}[1], [%2], #2 \n" \
  "zip1 v1.8b, v2.8b, v2.8b \n"
39
// Read 8 Y, 8 U and 8 V from 444.
// Loads 8 Y bytes from %0 into v0.8b, 8 U bytes from %1 into v1.d[0] and
// 8 V bytes from %2 into v1.d[1]. Adjacent chroma bytes are then summed
// pairwise (uaddlp) and halved with rounding (rshrn #1), leaving 4 averaged
// U bytes in v1 bytes 0-3 and 4 averaged V bytes in v1 bytes 4-7.
#define READYUV444 \
  MEMACCESS(0) \
  "ld1 {v0.8b}, [%0], #8 \n" \
  MEMACCESS(1) \
  "ld1 {v1.d}[0], [%1], #8 \n" \
  MEMACCESS(2) \
  "ld1 {v1.d}[1], [%2], #8 \n" \
  "uaddlp v1.8h, v1.16b \n" \
  "rshrn v1.8b, v1.8h, #1 \n"
50
// Read 8 Y, and set 4 U and 4 V to 128.
// Loads 8 Y bytes from %0 into v0.8b (post-incrementing %0) and fills all
// 8 bytes of v1.8b with 128 in place of chroma samples.
#define READYUV400 \
  MEMACCESS(0) \
  "ld1 {v0.8b}, [%0], #8 \n" \
  "movi v1.8b , #128 \n"
56
// Read 8 Y and 4 UV from NV12.
// Loads 8 Y bytes from %0 into v0.8b and 8 interleaved chroma bytes from %1
// into v2.8b. uzp1 gathers the even bytes (U) into v1, uzp2 gathers the odd
// bytes (V) into v3, and the ins copies the 4 V bytes into v1.s[1], so v1
// holds 4 U bytes followed by 4 V bytes. Uses v2 and v3 as scratch.
#define READNV12 \
  MEMACCESS(0) \
  "ld1 {v0.8b}, [%0], #8 \n" \
  MEMACCESS(1) \
  "ld1 {v2.8b}, [%1], #8 \n" \
  "uzp1 v1.8b, v2.8b, v2.8b \n" \
  "uzp2 v3.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"
66
// Read 8 Y and 4 VU from NV21.
// Same as READNV12 but with the chroma order swapped: the even bytes of the
// interleaved plane go to v3 and the odd bytes go to v1, then the 4 bytes in
// v3 are inserted into v1.s[1]. With V in the even positions this leaves
// 4 U bytes followed by 4 V bytes in v1. Uses v2 and v3 as scratch.
#define READNV21 \
  MEMACCESS(0) \
  "ld1 {v0.8b}, [%0], #8 \n" \
  MEMACCESS(1) \
  "ld1 {v2.8b}, [%1], #8 \n" \
  "uzp1 v3.8b, v2.8b, v2.8b \n" \
  "uzp2 v1.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"
76
// Read 8 YUY2 pixels (16 bytes) from %0.
// ld2 de-interleaves the stream: even bytes (Y) land in v0.8b and odd bytes
// (alternating chroma) land in v1.8b. The chroma bytes are then split again:
// uzp2 puts the odd chroma bytes in v3, uzp1 puts the even chroma bytes in
// v1, and the ins copies v3's 4 bytes into v1.s[1]. Uses v3 as scratch.
#define READYUY2 \
  MEMACCESS(0) \
  "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \
  "uzp2 v3.8b, v1.8b, v1.8b \n" \
  "uzp1 v1.8b, v1.8b, v1.8b \n" \
  "ins v1.s[1], v3.s[0] \n"
84
// Read 8 UYVY pixels (16 bytes) from %0.
// ld2 de-interleaves the stream: even bytes (alternating chroma) land in
// v2.8b and odd bytes (Y) land in v3.8b. Y is copied to v0 with orr, then
// the chroma bytes are split: even ones into v1, odd ones into v3, and the
// ins copies v3's 4 bytes into v1.s[1]. Uses v2 and v3 as scratch.
#define READUYVY \
  MEMACCESS(0) \
  "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \
  "orr v0.8b, v3.8b, v3.8b \n" \
  "uzp1 v1.8b, v2.8b, v2.8b \n" \
  "uzp2 v3.8b, v2.8b, v2.8b \n" \
  "ins v1.s[1], v3.s[0] \n"
93
// Load the YUV-to-RGB conversion constants used by YUVTORGB.
//   v24, v25, v26 : the three consecutive 16-bit values at kUVBiasBGR, each
//                   replicated to all 8 lanes (used as the B, G and R bias).
//   v31           : the 32-bit value at kYToRgb replicated to all 4 lanes.
//   v27, v28      : kUVToRB, de-interleaved by ld2.
//   v29, v30      : kUVToG, de-interleaved by ld2.
// A single ld3r is used for the bias so that no base register is written
// back: callers pass %[kUVBiasBGR] as an input-only operand, which inline
// asm must not modify.
#define YUVTORGB_SETUP \
  "ld3r {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
  "ld1r {v31.4s}, [%[kYToRgb]] \n" \
  "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
  "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
101
// Convert 8 pixels from YUV to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y bytes; v1.8b = 4 U bytes (bytes 0-3) followed by
//          4 V bytes (bytes 4-7); v24-v31 = constants from YUVTORGB_SETUP.
// Outputs: vR.8b, vG.8b, vB.8b (register names supplied by the caller).
// Scratch: v0, v1, v2, v3, v5, v6, v7.
// Y is widened to 32 bits, multiplied by v31 and narrowed back with a
// saturating >>16. Each chroma byte is duplicated so there is one U and one
// V per pixel, then multiplied by the per-channel coefficients. The channel
// sums use saturating adds/subtracts and are narrowed with a saturating >>6.
#define YUVTORGB(vR, vG, vB) \
  "uxtl v0.8h, v0.8b \n" /* Extract Y */ \
  "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
  "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
  "ushll v0.4s, v0.4h, #0 \n" \
  "mul v3.4s, v3.4s, v31.4s \n" \
  "mul v0.4s, v0.4s, v31.4s \n" \
  "sqshrun v0.4h, v0.4s, #16 \n" \
  "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \
  "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \
  "mov v2.d[0], v1.d[1] \n" /* Extract V */ \
  "uxtl v2.8h, v2.8b \n" \
  "uxtl v1.8h, v1.8b \n" /* Extract U */ \
  "mul v3.8h, v1.8h, v27.8h \n" \
  "mul v5.8h, v1.8h, v29.8h \n" \
  "mul v6.8h, v2.8h, v30.8h \n" \
  "mul v7.8h, v2.8h, v28.8h \n" \
  "sqadd v6.8h, v6.8h, v5.8h \n" \
  "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \
  "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \
  "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \
  "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \
  "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \
  "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \
  "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \
  "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
  "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
129
// Convert one row of I444 (separate Y, U and V planes, one chroma sample per
// pixel) to ARGB, stored in memory as B, G, R, A bytes with A = 255.
// READYUV444 averages each horizontal pair of chroma samples before the
// conversion. 8 pixels are produced per iteration and the loop runs until
// the remaining width is <= 0, so the output is written in whole groups of 8.
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n" /* A */
    "1: \n"
    READYUV444
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
159
// Convert one row of I422 (separate Y, U and V planes, one chroma sample per
// two pixels) to ARGB, stored in memory as B, G, R, A bytes with A = 255.
// 8 pixels are produced per iteration and the loop runs until the remaining
// width is <= 0, so the output is written in whole groups of 8.
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n" /* A */
    "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
189
// Convert one row of I422 plus a separate alpha plane to ARGB, stored in
// memory as B, G, R, A bytes. Alpha is copied unchanged from src_a, one byte
// per pixel. 8 pixels are produced per iteration and the loop runs until the
// remaining width is <= 0, so the output is written in whole groups of 8.
void I422AlphaToARGBRow_NEON(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             const uint8* src_a,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    MEMACCESS(3)
    "ld1 {v23.8b}, [%3], #8 \n"  // load 8 alpha bytes.
    "subs %w5, %w5, #8 \n"  // 8 pixels per loop.
    MEMACCESS(4)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(src_a),     // %3
      "+r"(dst_argb),  // %4
      "+r"(width)      // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
222
// Convert one row of I411 (separate Y, U and V planes, one chroma sample per
// four pixels) to ARGB, stored in memory as B, G, R, A bytes with A = 255.
// 8 pixels are produced per iteration and the loop runs until the remaining
// width is <= 0, so the output is written in whole groups of 8.
void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n" /* A */
    "1: \n"
    READYUV411
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
252
// Convert one row of I422 to RGBA, stored in memory as A, B, G, R bytes with
// A = 255. The colour registers are shifted up by one relative to the ARGB
// variants (R = v23, G = v22, B = v21) so alpha in v20 is stored first.
// 8 pixels are produced per iteration and the loop runs until the remaining
// width is <= 0, so the output is written in whole groups of 8.
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v20.8b, #255 \n" /* A */
    "1: \n"
    READYUV422
    YUVTORGB(v23, v22, v21)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    MEMACCESS(3)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"  // store A, B, G, R.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
282
// Convert one row of I422 to RGB24, stored in memory as B, G, R bytes
// (3 bytes per pixel, no alpha). 8 pixels (24 bytes) are produced per
// iteration and the loop runs until the remaining width is <= 0, so the
// output is written in whole groups of 8.
void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    MEMACCESS(3)
    "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"  // store B, G, R.
    "b.gt 1b \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
311
// Pack 8 pixels of 8-bit colour into RGB565.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R.
// Output:  v0.8h, one 16-bit pixel per lane laid out RRRRRGGG GGGBBBBB.
// Each channel is moved to the top byte of a 16-bit lane (shll #8); the two
// sri instructions then shift G and B right and insert them below the bits
// already in place. v20 and v21 are overwritten with their widened values.
#define ARGBTORGB565 \
  "shll v0.8h, v22.8b, #8 \n" /* R */ \
  "shll v21.8h, v21.8b, #8 \n" /* G */ \
  "shll v20.8h, v20.8b, #8 \n" /* B */ \
  "sri v0.8h, v21.8h, #5 \n" /* RG */ \
  "sri v0.8h, v20.8h, #11 \n" /* RGB */
318
// Convert one row of I422 to RGB565 (one 16-bit value per pixel).
// 8 pixels (16 bytes) are produced per iteration and the loop runs until the
// remaining width is <= 0, so the output is written in whole groups of 8.
void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    ARGBTORGB565
    MEMACCESS(3)
    "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels RGB565.
    "b.gt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_u),       // %1
      "+r"(src_v),       // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)        // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
348
// Pack 8 pixels of 8-bit colour plus alpha into ARGB1555.
// Inputs:  v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A.
// Output:  v0.8h, one 16-bit pixel per lane laid out ARRRRRGG GGGBBBBB.
// Each channel is moved to the top byte of a 16-bit lane (shll #8); the sri
// instructions then insert R, G and B below the top alpha bit. v20, v21 and
// v22 are overwritten with their widened values; v23 is only read.
#define ARGBTOARGB1555 \
  "shll v0.8h, v23.8b, #8 \n" /* A */ \
  "shll v22.8h, v22.8b, #8 \n" /* R */ \
  "shll v21.8h, v21.8b, #8 \n" /* G */ \
  "shll v20.8h, v20.8b, #8 \n" /* B */ \
  "sri v0.8h, v22.8h, #1 \n" /* AR */ \
  "sri v0.8h, v21.8h, #6 \n" /* ARG */ \
  "sri v0.8h, v20.8h, #11 \n" /* ARGB */
357
// Convert one row of I422 to ARGB1555 (one 16-bit value per pixel) with the
// alpha bit set. v23 is loaded with 255 once before the loop because
// ARGBTOARGB1555 only reads it. 8 pixels (16 bytes) are produced per
// iteration and the loop runs until the remaining width is <= 0, so the
// output is written in whole groups of 8.
void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    ARGBTOARGB1555
    MEMACCESS(3)
    "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB1555.
    "b.gt 1b \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)          // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
388
// Pack 8 pixels of 8-bit colour plus alpha into ARGB4444.
// Output: v0.16b, two bytes per pixel: a byte with G in the high nibble and
// B in the low nibble, followed by a byte with A high and R low.
// B and R are shifted down to the low nibble, G and A have their low nibble
// cleared with bic, the pairs are OR-ed together and zip1 interleaves the
// two resulting byte vectors. Modifies v20-v23 and uses v1 as scratch.
#define ARGBTOARGB4444 \
  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
  "ushr v20.8b, v20.8b, #4 \n" /* B */ \
  "bic v21.8b, v21.8b, v4.8b \n" /* G */ \
  "ushr v22.8b, v22.8b, #4 \n" /* R */ \
  "bic v23.8b, v23.8b, v4.8b \n" /* A */ \
  "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \
  "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
  "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
398
// Convert one row of I422 to ARGB4444 (one 16-bit value per pixel) with
// alpha fully set. 8 pixels (16 bytes) are produced per iteration and the
// loop runs until the remaining width is <= 0, so the output is written in
// whole groups of 8.
void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v4.16b, #0x0f \n"  // bits to clear with vbic.
    "1: \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs %w4, %w4, #8 \n"  // 8 pixels per loop.
    // Alpha is reloaded every iteration because ARGBTOARGB4444 masks v23
    // in place.
    "movi v23.8b, #255 \n"
    ARGBTOARGB4444
    MEMACCESS(3)
    "st1 {v0.8h}, [%3], #16 \n"  // store 8 pixels ARGB4444.
    "b.gt 1b \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)          // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
430
// Convert one row of I400 (Y plane only) to ARGB, stored in memory as
// B, G, R, A bytes with A = 255. Chroma is fixed at 128 by READYUV400 and
// the conversion always uses kYuvI601Constants. 8 pixels are produced per
// iteration and the loop runs until the remaining width is <= 0, so the
// output is written in whole groups of 8.
void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    READYUV400
    YUVTORGB(v22, v21, v20)
    "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
455
// Expand one row of 8-bit grey (J400) to ARGB without any colour
// conversion: each Y byte is copied to B, G and R, and A is set to 255.
// 8 pixels are produced per iteration and the loop runs until the remaining
// width is <= 0.
void J400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    MEMACCESS(0)
    "ld1 {v20.8b}, [%0], #8 \n"  // load 8 Y into B.
    "orr v21.8b, v20.8b, v20.8b \n"  // copy to G.
    "orr v22.8b, v20.8b, v20.8b \n"  // copy to R.
    "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "v20", "v21", "v22", "v23"
  );
}
477
// Convert one row of NV12 (Y plane plus an interleaved UV plane) to ARGB,
// stored in memory as B, G, R, A bytes with A = 255. 8 pixels are produced
// per iteration and the loop runs until the remaining width is <= 0, so the
// output is written in whole groups of 8.
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    READNV12
    YUVTORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"  // 8 pixels per loop.
    MEMACCESS(2)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
505
// Convert one row of NV21 (Y plane plus an interleaved VU plane) to ARGB,
// stored in memory as B, G, R, A bytes with A = 255. 8 pixels are produced
// per iteration and the loop runs until the remaining width is <= 0, so the
// output is written in whole groups of 8.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    READNV21
    YUVTORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"  // 8 pixels per loop.
    MEMACCESS(2)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
533
// Convert one row of NV12 (Y plane plus an interleaved UV plane) to RGB565
// (one 16-bit value per pixel). 8 pixels (16 bytes) are produced per
// iteration and the loop runs until the remaining width is <= 0, so the
// output is written in whole groups of 8.
void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
    "1: \n"
    READNV12
    YUVTORGB(v22, v21, v20)
    "subs %w3, %w3, #8 \n"  // 8 pixels per loop.
    ARGBTORGB565
    MEMACCESS(2)
    "st1 {v0.8h}, [%2], 16 \n"  // store 8 pixels RGB565.
    "b.gt 1b \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
561
// Convert one row of packed YUY2 to ARGB, stored in memory as B, G, R, A
// bytes with A = 255. 8 pixels (16 source bytes) are consumed per iteration
// and the loop runs until the remaining width is <= 0, so the output is
// written in whole groups of 8.
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    READYUY2
    YUVTORGB(v22, v21, v20)
    "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
587
// Convert one row of packed UYVY to ARGB, stored in memory as B, G, R, A
// bytes with A = 255. 8 pixels (16 source bytes) are consumed per iteration
// and the loop runs until the remaining width is <= 0, so the output is
// written in whole groups of 8.
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi v23.8b, #255 \n"  // Alpha.
    "1: \n"
    READUYVY
    YUVTORGB(v22, v21, v20)
    "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
    MEMACCESS(1)
    "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"  // store B, G, R, A.
    "b.gt 1b \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 is listed because YUVTORGB_SETUP loads kYToRgb into it.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
613
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
// ld2 performs the de-interleave: even bytes go to v0 and odd bytes to v1.
// width counts UV pairs; 16 are handled per iteration and the loop runs
// until the remaining width is <= 0.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pairs of UV
    "subs %w3, %w3, #16 \n"  // 16 processed per loop
    MEMACCESS(1)
    "st1 {v0.16b}, [%1], #16 \n"  // store U
    MEMACCESS(2)
    "st1 {v1.16b}, [%2], #16 \n"  // store V
    "b.gt 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
635
// Reads 16 U's and V's and writes out 16 pairs of UV.
// st2 performs the interleave: bytes from v0 (U) and v1 (V) alternate in the
// output. width counts UV pairs; 16 are handled per iteration and the loop
// runs until the remaining width is <= 0.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load U
    MEMACCESS(1)
    "ld1 {v1.16b}, [%1], #16 \n"  // load V
    "subs %w3, %w3, #16 \n"  // 16 processed per loop
    MEMACCESS(2)
    "st2 {v0.16b,v1.16b}, [%2], #32 \n"  // store 16 pairs of UV
    "b.gt 1b \n"
    :
      "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "v0", "v1"  // Clobber List
  );
}
658
// Copy multiple of 32 bytes from src to dst.
// Each iteration moves 32 bytes with one four-register ld1/st1 pair and the
// loop runs until the remaining count is <= 0, so a count that is not a
// multiple of 32 is rounded up.
// NOTE(review): the previous comment here referred to "vld4.8" being fastest
// on a15, which does not match the ld1 used below; it was presumably carried
// over from the 32-bit version - confirm before relying on it.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 32
    "subs %w2, %w2, #32 \n"  // 32 processed per loop
    MEMACCESS(1)
    "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 32
    "b.gt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(count)  // %2  // Output registers
    :                     // Input registers
    : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
676
// SetRow writes 'count' bytes using an 8 bit value repeated.
// The value is broadcast to all 16 bytes of v0 once, then 16 bytes are
// stored per iteration until the remaining count is <= 0, so a count that is
// not a multiple of 16 is rounded up.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
  asm volatile (
    "dup v0.16b, %w2 \n"  // duplicate 16 bytes
    "1: \n"
    "subs %w1, %w1, #16 \n"  // 16 bytes per loop
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"  // store
    "b.gt 1b \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v8)      // %2
    : "cc", "memory", "v0"
  );
}
692
// Fill a row with a repeated 32-bit value. count is in 32-bit units.
// The value is broadcast to all 4 lanes of v0 once, then 4 values (16 bytes)
// are stored per iteration until the remaining count is <= 0, so a count
// that is not a multiple of 4 is rounded up.
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (
    "dup v0.4s, %w2 \n"  // duplicate 4 ints
    "1: \n"
    "subs %w1, %w1, #4 \n"  // 4 ints per loop
    MEMACCESS(0)
    "st1 {v0.16b}, [%0], #16 \n"  // store
    "b.gt 1b \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v32)     // %2
    : "cc", "memory", "v0"
  );
}
707
// Copy a row of bytes in reverse order.
// src is advanced to 16 bytes before the end of the row and walked backwards
// 16 bytes at a time while dst is written forwards. rev64 reverses the bytes
// inside each 64-bit half, and storing the high half before the low half
// completes the 16-byte reversal. The loop runs until the remaining width
// is <= 0.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "add %0, %0, %w2, sxtw \n"
    "sub %0, %0, #16 \n"
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], %3 \n"  // src -= 16
    "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
    "rev64 v0.16b, v0.16b \n"
    MEMACCESS(1)
    "st1 {v0.D}[1], [%1], #8 \n"  // dst += 16
    MEMACCESS(1)
    "st1 {v0.D}[0], [%1], #8 \n"
    "b.gt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(width)  // %2
    : "r"((ptrdiff_t)-16)  // %3
    : "cc", "memory", "v0"
  );
}
730
// Split an interleaved UV row into separate U and V rows, each reversed.
// width counts UV pairs. src_uv is advanced to 16 bytes before the end of
// the row (width * 2 bytes) and walked backwards 8 pairs at a time. ld2
// de-interleaves into v0 (even bytes) and v1 (odd bytes), and rev64 reverses
// the 8 bytes of each before they are stored forwards. The loop runs until
// the remaining width is <= 0.
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  asm volatile (
    // Start at end of source row.
    "add %0, %0, %w3, sxtw #1 \n"
    "sub %0, %0, #16 \n"
    "1: \n"
    MEMACCESS(0)
    "ld2 {v0.8b, v1.8b}, [%0], %4 \n"  // src -= 16
    "subs %w3, %w3, #8 \n"  // 8 pixels per loop.
    "rev64 v0.8b, v0.8b \n"
    "rev64 v1.8b, v1.8b \n"
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n"  // dst += 8
    MEMACCESS(2)
    "st1 {v1.8b}, [%2], #8 \n"
    "b.gt 1b \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3
    : "r"((ptrdiff_t)-16)  // %4
    : "cc", "memory", "v0", "v1"
  );
}
756
// Copy a row of 4-byte pixels in reverse pixel order; the bytes inside each
// pixel keep their order.
// src is advanced to 16 bytes before the end of the row (width * 4 bytes)
// and walked backwards 4 pixels at a time. rev64 on 32-bit lanes swaps the
// two pixels inside each 64-bit half, and storing the high half before the
// low half completes the reversal. The loop runs until the remaining width
// is <= 0.
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "add %0, %0, %w2, sxtw #2 \n"
    "sub %0, %0, #16 \n"
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], %3 \n"  // src -= 16
    "subs %w2, %w2, #4 \n"  // 4 pixels per loop.
    "rev64 v0.4s, v0.4s \n"
    MEMACCESS(1)
    "st1 {v0.D}[1], [%1], #8 \n"  // dst += 16
    MEMACCESS(1)
    "st1 {v0.D}[0], [%1], #8 \n"
    "b.gt 1b \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(width)  // %2
    : "r"((ptrdiff_t)-16)  // %3
    : "cc", "memory", "v0"
  );
}
779
// Expand one row of 3-byte RGB24 pixels to 4-byte ARGB pixels by appending
// an alpha byte of 255; the three colour bytes keep their order.
// 8 pixels are handled per iteration and the loop runs until the remaining
// width is <= 0.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "movi v4.8b, #255 \n"  // Alpha
    "1: \n"
    MEMACCESS(0)
    "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    MEMACCESS(1)
    "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
    : "+r"(src_rgb24),  // %0
      "+r"(dst_argb),   // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
797
// Convert one row of 3-byte RAW pixels (r, g, b in memory) to 4-byte ARGB
// pixels (b, g, r, a in memory) with alpha set to 255.
// The first and third colour bytes are swapped by copying g and r into the
// registers that follow b, so the st4 register list is consecutive.
// 8 pixels are handled per iteration and the loop runs until the remaining
// width is <= 0.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "movi v5.8b, #255 \n"  // Alpha
    "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "orr v3.8b, v1.8b, v1.8b \n"  // move g
    "orr v4.8b, v0.8b, v0.8b \n"  // move r
    MEMACCESS(1)
    "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
    "b.gt 1b \n"
    : "+r"(src_raw),   // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
  );
}
817
// Convert one row of 3-byte RAW pixels (r, g, b in memory) to 3-byte RGB24
// pixels (b, g, r in memory) by swapping the first and third byte.
// g and r are copied into the registers that follow b so the st3 register
// list is consecutive. 8 pixels are handled per iteration and the loop runs
// until the remaining width is <= 0.
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    "orr v3.8b, v1.8b, v1.8b \n"  // move g
    "orr v4.8b, v0.8b, v0.8b \n"  // move r
    MEMACCESS(1)
    "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
    "b.gt 1b \n"
    : "+r"(src_raw),    // %0
      "+r"(dst_rgb24),  // %1
      "+r"(width)       // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
836
// Unpack 8 RGB565 pixels to 8-bit B, G and R.
// Input:   v0.8h = 8 RGB565 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R.
// Scratch: v4, v6.
// Each field is shifted to the top of a byte and its own top bits are
// OR-ed into the vacated low bits, so a full-scale field expands to 255.
// B and R are processed together in the low and high halves of one 128-bit
// register; the final dup copies R out of the high half.
#define RGB565TOARGB \
  "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
  "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \
  "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \
  "orr v1.8b, v4.8b, v6.8b \n" /* G */ \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \
  "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \
  "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \
  "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \
  "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
  "dup v2.2D, v0.D[1] \n" /* R */
849
// Convert one row of RGB565 pixels to ARGB, stored in memory as B, G, R, A
// bytes with A = 255. 8 pixels (16 source bytes) are handled per iteration
// and the loop runs until the remaining width is <= 0.
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
  asm volatile (
    "movi v3.8b, #255 \n"  // Alpha
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 RGB565 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    RGB565TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
    : "+r"(src_rgb565),  // %0
      "+r"(dst_argb),    // %1
      "+r"(width)        // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
  );
}
868
// Unpack 8 ARGB1555 pixels to 8-bit B, G, R and A.
// Input:   v0.8h = 8 ARGB1555 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
// The single alpha bit is expanded to 0x00 or 0xFF with an arithmetic shift
// right by 15. Each 5-bit colour field is shifted to the top of a byte and
// its own top 3 bits are OR-ed into the vacated low bits. B/G and R/A are
// processed as pairs in the low and high halves of 128-bit registers; the
// final two dups copy G and A out of the high halves.
#define ARGB1555TOARGB \
  "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
  "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
  "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \
  \
  "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \
  "xtn2 v3.16b, v2.8h \n" \
  \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
  \
  "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \
  "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
  \
  "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \
  "dup v1.2D, v0.D[1] \n" \
  "dup v3.2D, v2.D[1] \n"
888
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input:   v0.8h = 8 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R. v3 is used as scratch and does
// not hold alpha on exit.
#define RGB555TOARGB \
  "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
  "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \
  "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \
  \
  "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \
  "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \
  \
  "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \
  "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \
  \
  "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \
  "orr v2.16b, v1.16b, v3.16b \n" /* R */ \
  "dup v1.2D, v0.D[1] \n" /* G */ \
905
// Convert one row of ARGB1555 pixels to ARGB, stored in memory as
// B, G, R, A bytes. Alpha comes from the source pixel's top bit, expanded
// to 0 or 255 by ARGB1555TOARGB. 8 pixels (16 source bytes) are handled per
// iteration and the loop runs until the remaining width is <= 0.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int width) {
  asm volatile (
    // NOTE(review): ARGB1555TOARGB rewrites v3 on every iteration before it
    // is stored, so this initial value never reaches the output.
    "movi v3.8b, #255 \n"  // Alpha
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
    : "+r"(src_argb1555),  // %0
      "+r"(dst_argb),      // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
925
// Unpack 8 ARGB4444 pixels to 8-bit B, G, R and A.
// Input:   v0.8h = 8 ARGB4444 pixels.
// Outputs: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
// The high byte (A and R nibbles) and low byte (G and B nibbles) of every
// pixel are gathered into the low and high halves of v1. Each nibble is then
// duplicated into both halves of its own byte, so 0xF expands to 0xFF. The
// final two dups copy B and G out of the high halves.
#define ARGB4444TOARGB \
  "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
  "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \
  "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \
  "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \
  "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \
  "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \
  "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \
  "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \
  "dup v0.2D, v2.D[1] \n" \
  "dup v1.2D, v3.D[1] \n"
937
// Convert one row of ARGB4444 pixels to ARGB, stored in memory as
// B, G, R, A bytes. 8 pixels (16 source bytes) are handled per iteration
// and the loop runs until the remaining width is <= 0.
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int width) {
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n"  // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n"  // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt 1b \n"
    : "+r"(src_argb4444),  // %0
      "+r"(dst_argb),      // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}
956
// Converts a row of 4-byte ARGB pixels to 3-byte RGB24, 8 pixels per loop
// iteration: ld4 de-interleaves 32 bytes into v1-v4 (one register per byte
// position) and st3 writes back only v1-v3, dropping the fourth byte (v4) of
// every pixel. Byte order of the three kept channels is unchanged.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
ARGBToRGB24Row_NEON(const uint8 * src_argb,uint8 * dst_rgb24,int width)957 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
958 asm volatile (
959 "1: \n"
960 MEMACCESS(0)
961 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels
962 "subs %w2, %w2, #8 \n" // 8 processed per loop.
963 MEMACCESS(1)
964 "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
965 "b.gt 1b \n"
966 : "+r"(src_argb), // %0
967 "+r"(dst_rgb24), // %1
968 "+r"(width) // %2
969 :
970 : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
971 );
972 }
973
// Converts a row of 4-byte ARGB pixels to 3-byte RAW, 8 pixels per loop
// iteration. ld4 de-interleaves the pixels into v1..v4 (b, g, r, a); g and b
// are copied into v4 and v5 so that v3,v4,v5 form the consecutive register
// list st3 requires, which writes the bytes in r, g, b order. The loaded
// alpha in v4 is overwritten by the copy and never stored.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
ARGBToRAWRow_NEON(const uint8 * src_argb,uint8 * dst_raw,int width)974 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
975 asm volatile (
976 "1: \n"
977 MEMACCESS(0)
978 "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a
979 "subs %w2, %w2, #8 \n" // 8 processed per loop.
980 "orr v4.8b, v2.8b, v2.8b \n" // mov g
981 "orr v5.8b, v1.8b, v1.8b \n" // mov b
982 MEMACCESS(1)
983 "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b
984 "b.gt 1b \n"
985 : "+r"(src_argb), // %0
986 "+r"(dst_raw), // %1
987 "+r"(width) // %2
988 :
989 : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
990 );
991 }
992
// Extracts the Y plane from a row of YUY2, 16 pixels per loop iteration.
// ld2 de-interleaves 32 source bytes into even-indexed bytes (v0) and
// odd-indexed bytes (v1); only v0 (the even bytes) is stored as Y.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
YUY2ToYRow_NEON(const uint8 * src_yuy2,uint8 * dst_y,int width)993 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
994 asm volatile (
995 "1: \n"
996 MEMACCESS(0)
997 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
998 "subs %w2, %w2, #16 \n" // 16 processed per loop.
999 MEMACCESS(1)
1000 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
1001 "b.gt 1b \n"
1002 : "+r"(src_yuy2), // %0
1003 "+r"(dst_y), // %1
1004 "+r"(width) // %2
1005 :
1006 : "cc", "memory", "v0", "v1" // Clobber List
1007 );
1008 }
1009
// Extracts the Y plane from a row of UYVY, 16 pixels per loop iteration.
// ld2 de-interleaves 32 source bytes into even-indexed bytes (v0) and
// odd-indexed bytes (v1); only v1 (the odd bytes) is stored as Y.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
UYVYToYRow_NEON(const uint8 * src_uyvy,uint8 * dst_y,int width)1010 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
1011 asm volatile (
1012 "1: \n"
1013 MEMACCESS(0)
1014 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
1015 "subs %w2, %w2, #16 \n" // 16 processed per loop.
1016 MEMACCESS(1)
1017 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
1018 "b.gt 1b \n"
1019 : "+r"(src_uyvy), // %0
1020 "+r"(dst_y), // %1
1021 "+r"(width) // %2
1022 :
1023 : "cc", "memory", "v0", "v1" // Clobber List
1024 );
1025 }
1026
// Extracts the U and V planes from one row of YUY2 (no vertical averaging).
// ld4 de-interleaves 32 source bytes into v0-v3 by byte position within each
// 4-byte group; byte 1 (v1) is stored as U and byte 3 (v3) as V, giving 8 U
// and 8 V values per 16 pixels. width counts pixels, not UV samples.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
YUY2ToUV422Row_NEON(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)1027 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
1028 int width) {
1029 asm volatile (
1030 "1: \n"
1031 MEMACCESS(0)
1032 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
1033 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1034 MEMACCESS(1)
1035 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1036 MEMACCESS(2)
1037 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1038 "b.gt 1b \n"
1039 : "+r"(src_yuy2), // %0
1040 "+r"(dst_u), // %1
1041 "+r"(dst_v), // %2
1042 "+r"(width) // %3
1043 :
1044 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1045 );
1046 }
1047
// Extracts the U and V planes from one row of UYVY (no vertical averaging).
// ld4 de-interleaves 32 source bytes into v0-v3 by byte position within each
// 4-byte group; byte 0 (v0) is stored as U and byte 2 (v2) as V, giving 8 U
// and 8 V values per 16 pixels. width counts pixels, not UV samples.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
UYVYToUV422Row_NEON(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1048 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
1049 int width) {
1050 asm volatile (
1051 "1: \n"
1052 MEMACCESS(0)
1053 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
1054 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1055 MEMACCESS(1)
1056 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1057 MEMACCESS(2)
1058 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1059 "b.gt 1b \n"
1060 : "+r"(src_uyvy), // %0
1061 "+r"(dst_u), // %1
1062 "+r"(dst_v), // %2
1063 "+r"(width) // %3
1064 :
1065 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1066 );
1067 }
1068
// Extracts U and V from two vertically adjacent YUY2 rows and averages them.
// The second row starts at src_yuy2 + stride_yuy2 (stride in bytes). Each
// iteration de-interleaves 32 bytes from both rows with ld4, combines the
// U bytes (position 1) and V bytes (position 3) of the two rows with urhadd
// (rounding halving add), and stores 8 U and 8 V. width counts pixels.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
YUY2ToUVRow_NEON(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)1069 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
1070 uint8* dst_u, uint8* dst_v, int width) {
1071 const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1072 asm volatile (
1073 "1: \n"
1074 MEMACCESS(0)
1075 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1076 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1077 MEMACCESS(1)
1078 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1079 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1080 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1081 MEMACCESS(2)
1082 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1083 MEMACCESS(3)
1084 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1085 "b.gt 1b \n"
1086 : "+r"(src_yuy2), // %0
1087 "+r"(src_yuy2b), // %1
1088 "+r"(dst_u), // %2
1089 "+r"(dst_v), // %3
1090 "+r"(width) // %4
1091 :
1092 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1093 "v5", "v6", "v7" // Clobber List
1094 );
1095 }
1096
// Extracts U and V from two vertically adjacent UYVY rows and averages them.
// The second row starts at src_uyvy + stride_uyvy (stride in bytes). Each
// iteration de-interleaves 32 bytes from both rows with ld4, combines the
// U bytes (position 0) and V bytes (position 2) of the two rows with urhadd
// (rounding halving add), and stores 8 U and 8 V. width counts pixels.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
UYVYToUVRow_NEON(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1097 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
1098 uint8* dst_u, uint8* dst_v, int width) {
1099 const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1100 asm volatile (
1101 "1: \n"
1102 MEMACCESS(0)
1103 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1104 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1105 MEMACCESS(1)
1106 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1107 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1108 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1109 MEMACCESS(2)
1110 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1111 MEMACCESS(3)
1112 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1113 "b.gt 1b \n"
1114 : "+r"(src_uyvy), // %0
1115 "+r"(src_uyvyb), // %1
1116 "+r"(dst_u), // %2
1117 "+r"(dst_v), // %3
1118 "+r"(width) // %4
1119 :
1120 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1121 "v5", "v6", "v7" // Clobber List
1122 );
1123 }
1124
1125 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of a row of 4-byte pixels with a table lookup.
// shuffler points at 16 index bytes that are loaded once into v2; each loop
// iteration loads 16 source bytes (4 pixels), uses tbl to pick output byte i
// from source byte shuffler[i] of that same 16-byte group, and stores the
// 16 result bytes. shuffler is a read-only input and is not advanced.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 4 - confirm callers.
ARGBShuffleRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)1126 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
1127 const uint8* shuffler, int width) {
1128 asm volatile (
1129 MEMACCESS(3)
1130 "ld1 {v2.16b}, [%3] \n" // shuffler
1131 "1: \n"
1132 MEMACCESS(0)
1133 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1134 "subs %w2, %w2, #4 \n" // 4 processed per loop
1135 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1136 MEMACCESS(1)
1137 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1138 "b.gt 1b \n"
1139 : "+r"(src_argb), // %0
1140 "+r"(dst_argb), // %1
1141 "+r"(width) // %2
1142 : "r"(shuffler) // %3
1143 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1144 );
1145 }
1146
// Packs planar Y, U and V rows into interleaved YUY2, 16 pixels per loop
// iteration. ld2 splits 16 Y bytes into even (v0) and odd (v1) samples; the
// odd samples are moved to v2 so v1 can receive 8 U bytes, and v3 receives
// 8 V bytes. st4 then interleaves v0,v1,v2,v3 as Y-even, U, Y-odd, V.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
I422ToYUY2Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_yuy2,int width)1147 void I422ToYUY2Row_NEON(const uint8* src_y,
1148 const uint8* src_u,
1149 const uint8* src_v,
1150 uint8* dst_yuy2, int width) {
1151 asm volatile (
1152 "1: \n"
1153 MEMACCESS(0)
1154 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1155 "orr v2.8b, v1.8b, v1.8b \n" // move odd Ys to v2
1156 MEMACCESS(1)
1157 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1158 MEMACCESS(2)
1159 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1160 "subs %w4, %w4, #16 \n" // 16 pixels
1161 MEMACCESS(3)
1162 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1163 "b.gt 1b \n"
1164 : "+r"(src_y), // %0
1165 "+r"(src_u), // %1
1166 "+r"(src_v), // %2
1167 "+r"(dst_yuy2), // %3
1168 "+r"(width) // %4
1169 :
1170 : "cc", "memory", "v0", "v1", "v2", "v3"
1171 );
1172 }
1173
// Packs planar Y, U and V rows into interleaved UYVY, 16 pixels per loop
// iteration. ld2 splits 16 Y bytes into even (v1) and odd (v2) samples; the
// odd samples are moved to v3 so v2 can receive 8 V bytes, and v0 receives
// 8 U bytes. st4 then interleaves v0,v1,v2,v3 as U, Y-even, V, Y-odd.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
I422ToUYVYRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_uyvy,int width)1174 void I422ToUYVYRow_NEON(const uint8* src_y,
1175 const uint8* src_u,
1176 const uint8* src_v,
1177 uint8* dst_uyvy, int width) {
1178 asm volatile (
1179 "1: \n"
1180 MEMACCESS(0)
1181 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1182 "orr v3.8b, v2.8b, v2.8b \n" // move odd Ys to v3
1183 MEMACCESS(1)
1184 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1185 MEMACCESS(2)
1186 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1187 "subs %w4, %w4, #16 \n" // 16 pixels
1188 MEMACCESS(3)
1189 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1190 "b.gt 1b \n"
1191 : "+r"(src_y), // %0
1192 "+r"(src_u), // %1
1193 "+r"(src_v), // %2
1194 "+r"(dst_uyvy), // %3
1195 "+r"(width) // %4
1196 :
1197 : "cc", "memory", "v0", "v1", "v2", "v3"
1198 );
1199 }
1200
// Converts a row of ARGB pixels to RGB565, 8 pixels per loop iteration.
// ld4 de-interleaves 32 source bytes into v20-v23, the ARGBTORGB565 asm
// fragment (defined outside this view) packs them, and the 16 result bytes
// are stored from v0.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
// NOTE(review): the clobber list assumes ARGBTORGB565 writes only v0 and
// v20-v23 - verify against the macro definition.
ARGBToRGB565Row_NEON(const uint8 * src_argb,uint8 * dst_rgb565,int width)1201 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1202 asm volatile (
1203 "1: \n"
1204 MEMACCESS(0)
1205 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1206 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1207 ARGBTORGB565
1208 MEMACCESS(1)
1209 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1210 "b.gt 1b \n"
1211 : "+r"(src_argb), // %0
1212 "+r"(dst_rgb565), // %1
1213 "+r"(width) // %2
1214 :
1215 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1216 );
1217 }
1218
// Converts a row of ARGB pixels to RGB565 with dithering, 8 pixels per loop
// iteration. The 32-bit dither4 value is broadcast across v1, so its four
// bytes repeat every four pixels; they are added with unsigned saturation to
// the first three channel registers (v20-v22) before the ARGBTORGB565 asm
// fragment (defined outside this view) packs the pixels into v0.
// src_argb and width are read-write ("+r") operands because the asm
// post-increments the source pointer and decrements the counter; declaring
// them input-only would let the compiler assume the registers are unchanged.
// Operand numbering matches the other row functions: src %0, dst %1,
// width %2, with the read-only dither4 as %3.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
ARGBToRGB565DitherRow_NEON(const uint8 * src_argb,uint8 * dst_rgb,const uint32 dither4,int width)1219 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
1220 const uint32 dither4, int width) {
1221 asm volatile (
1222 "dup v1.4s, %w3 \n" // dither4
1223 "1: \n"
1224 MEMACCESS(0)
1225 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1226 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1227 "uqadd v20.8b, v20.8b, v1.8b \n" // saturating add of dither
1228 "uqadd v21.8b, v21.8b, v1.8b \n"
1229 "uqadd v22.8b, v22.8b, v1.8b \n"
1230 ARGBTORGB565
1231 MEMACCESS(1)
1232 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1233 "b.gt 1b \n"
1234 : "+r"(src_argb), // %0
1235 "+r"(dst_rgb), // %1
1236 "+r"(width) // %2
1237 : "r"(dither4) // %3
1238 : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1239 );
1240 }
1241
// Converts a row of ARGB pixels to ARGB1555, 8 pixels per loop iteration.
// ld4 de-interleaves 32 source bytes into v20-v23, the ARGBTOARGB1555 asm
// fragment (defined outside this view) packs them, and the 16 result bytes
// are stored from v0.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
// NOTE(review): the clobber list assumes ARGBTOARGB1555 writes only v0 and
// v20-v23 - verify against the macro definition.
ARGBToARGB1555Row_NEON(const uint8 * src_argb,uint8 * dst_argb1555,int width)1242 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
1243 int width) {
1244 asm volatile (
1245 "1: \n"
1246 MEMACCESS(0)
1247 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1248 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1249 ARGBTOARGB1555
1250 MEMACCESS(1)
1251 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
1252 "b.gt 1b \n"
1253 : "+r"(src_argb), // %0
1254 "+r"(dst_argb1555), // %1
1255 "+r"(width) // %2
1256 :
1257 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1258 );
1259 }
1260
// Converts a row of ARGB pixels to ARGB4444, 8 pixels per loop iteration.
// v4 is preset once, outside the loop, with 0x0f in every byte; ld4 then
// de-interleaves 32 source bytes into v20-v23, the ARGBTOARGB4444 asm
// fragment (defined outside this view) packs them, and the 16 result bytes
// are stored from v0.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
// NOTE(review): presumably ARGBTOARGB4444 reads v4 as a nibble mask and uses
// v1 as scratch (both are in the clobber list) - verify against the macro.
ARGBToARGB4444Row_NEON(const uint8 * src_argb,uint8 * dst_argb4444,int width)1261 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
1262 int width) {
1263 asm volatile (
1264 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
1265 "1: \n"
1266 MEMACCESS(0)
1267 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1268 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1269 ARGBTOARGB4444
1270 MEMACCESS(1)
1271 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
1272 "b.gt 1b \n"
1273 : "+r"(src_argb), // %0
1274 "+r"(dst_argb4444), // %1
1275 "+r"(width) // %2
1276 :
1277 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1278 );
1279 }
1280
// Computes 8-bit luma from a row of ARGB pixels, 8 pixels per loop iteration.
// Per pixel: Y = sat8(round((13*B + 65*G + 33*R) >> 7)) + 16, where the final
// add saturates at 255 (uqadd). The weighted sum is accumulated in 16-bit
// lanes (umull/umlal) and narrowed with a rounding, saturating shift
// (sqrshrun). The fourth byte of each pixel (alpha) is loaded but its
// register, v3, is reused as the accumulator.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
ARGBToYRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1281 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1282 asm volatile (
1283 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1284 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1285 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1286 "movi v7.8b, #16 \n" // Add 16 constant
1287 "1: \n"
1288 MEMACCESS(0)
1289 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1290 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1291 "umull v3.8h, v0.8b, v4.8b \n" // B
1292 "umlal v3.8h, v1.8b, v5.8b \n" // G
1293 "umlal v3.8h, v2.8b, v6.8b \n" // R
1294 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1295 "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
1296 MEMACCESS(1)
1297 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1298 "b.gt 1b \n"
1299 : "+r"(src_argb), // %0
1300 "+r"(dst_y), // %1
1301 "+r"(width) // %2
1302 :
1303 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1304 );
1305 }
1306
// Extracts the alpha channel from a row of ARGB pixels, 16 pixels per loop
// iteration. ld4 de-interleaves 64 source bytes into v0-v3 by byte position;
// only v3 (the fourth byte of every pixel) is stored.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
ARGBExtractAlphaRow_NEON(const uint8 * src_argb,uint8 * dst_a,int width)1307 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1308 asm volatile (
1309 "1: \n"
1310 MEMACCESS(0)
1311 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
1312 "subs %w2, %w2, #16 \n" // 16 processed per loop
1313 MEMACCESS(1)
1314 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
1315 "b.gt 1b \n"
1316 : "+r"(src_argb), // %0
1317 "+r"(dst_a), // %1
1318 "+r"(width) // %2
1319 :
1320 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1321 );
1322 }
1323
// Computes 8-bit luma from a row of ARGB pixels using the "J" coefficient
// set, 8 pixels per loop iteration.
// Per pixel: Y = sat8(round((15*B + 75*G + 38*R) >> 7)); unlike
// ARGBToYRow_NEON no offset is added afterwards. The weighted sum is
// accumulated in 16-bit lanes and narrowed with sqrshrun. The loaded alpha
// register, v3, is reused as the accumulator.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
ARGBToYJRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1324 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1325 asm volatile (
1326 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1327 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1328 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1329 "1: \n"
1330 MEMACCESS(0)
1331 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1332 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1333 "umull v3.8h, v0.8b, v4.8b \n" // B
1334 "umlal v3.8h, v1.8b, v5.8b \n" // G
1335 "umlal v3.8h, v2.8b, v6.8b \n" // R
1336 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1337 MEMACCESS(1)
1338 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1339 "b.gt 1b \n"
1340 : "+r"(src_argb), // %0
1341 "+r"(dst_y), // %1
1342 "+r"(width) // %2
1343 :
1344 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1345 );
1346 }
1347
1348 // 8x1 pixels.
// Computes one U and one V sample per ARGB pixel (no subsampling), 8 pixels
// per loop iteration. In 16-bit lanes:
//   U = (112*B - 74*G - 38*R + 0x8080) >> 8
//   V = (112*R - 94*G - 18*B + 0x8080) >> 8
// 0x8080 comes from v29 holding 0x80 in every byte, i.e. a bias of 128 plus
// 0.5 for rounding in 8.8 fixed point. The final narrowing shift (uqshrn) is
// unsigned and saturating. The loaded alpha register, v3, is reused as the V
// accumulator.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 8 - confirm callers.
ARGBToUV444Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1349 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1350 int width) {
1351 asm volatile (
1352 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
1353 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1354 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1355 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1356 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1357 "movi v29.16b,#0x80 \n" // 128.5
1358 "1: \n"
1359 MEMACCESS(0)
1360 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1361 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1362 "umull v4.8h, v0.8b, v24.8b \n" // B
1363 "umlsl v4.8h, v1.8b, v25.8b \n" // G
1364 "umlsl v4.8h, v2.8b, v26.8b \n" // R
1365 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
1366
1367 "umull v3.8h, v2.8b, v24.8b \n" // R
1368 "umlsl v3.8h, v1.8b, v28.8b \n" // G
1369 "umlsl v3.8h, v0.8b, v27.8b \n" // B
1370 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
1371
1372 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
1373 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1374
1375 MEMACCESS(1)
1376 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1377 MEMACCESS(2)
1378 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1379 "b.gt 1b \n"
1380 : "+r"(src_argb), // %0
1381 "+r"(dst_u), // %1
1382 "+r"(dst_v), // %2
1383 "+r"(width) // %3
1384 :
1385 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1386 "v24", "v25", "v26", "v27", "v28", "v29"
1387 );
1388 }
1389
// Asm fragment: loads the constants used by the subsampled RGB -> UV
// kernels. v20-v24 receive the 16-bit coefficients 56, 37, 19, 9 and 47 in
// every lane (half of the 112/74/38/18/94 set, because the callers feed
// channel sums that are twice the average), and v25 receives 0x80 in every
// byte, i.e. 0x8080 per 16-bit lane. Writes v20-v25 only; it is meant to be
// placed once before the loop label.
1390 #define RGBTOUV_SETUP_REG \
1391 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
1392 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
1393 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
1394 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
1395 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
1396 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
1397
1398 // 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
// Computes U and V from one row of ARGB with 4:1 horizontal subsampling:
// every loop iteration consumes 32 pixels (two 64-byte ld4 loads from the
// same row) and produces 8 U and 8 V bytes.
// uaddlp sums horizontally adjacent pixel pairs into 16-bit lanes and addp
// sums adjacent pairs of those, so each lane holds the sum of 4 neighbouring
// pixels; urshr #1 halves that with rounding (2x the average), which the
// halved coefficients from RGBTOUV_SETUP_REG compensate for. The U/V
// arithmetic is then
//   U = (v20*B - v21*G - v22*R + v25) >> 8
//   V = (v20*R - v24*G - v23*B + v25) >> 8
// with an unsigned saturating narrow. Alpha (v3, v7) is loaded but unused;
// v3 and v4 are reused as accumulators.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 32 - confirm callers.
ARGBToUV411Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1399 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
1400 int width) {
1401 asm volatile (
1402 RGBTOUV_SETUP_REG
1403 "1: \n"
1404 MEMACCESS(0)
1405 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1406 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1407 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1408 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1409 MEMACCESS(0)
1410 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16.
1411 "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1412 "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1413 "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1414
1415 "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts.
1416 "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts.
1417 "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts.
1418
1419 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1420 "urshr v1.8h, v1.8h, #1 \n"
1421 "urshr v2.8h, v2.8h, #1 \n"
1422
1423 "subs %w3, %w3, #32 \n" // 32 processed per loop.
1424 "mul v3.8h, v0.8h, v20.8h \n" // B
1425 "mls v3.8h, v1.8h, v21.8h \n" // G
1426 "mls v3.8h, v2.8h, v22.8h \n" // R
1427 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1428 "mul v4.8h, v2.8h, v20.8h \n" // R
1429 "mls v4.8h, v1.8h, v24.8h \n" // G
1430 "mls v4.8h, v0.8h, v23.8h \n" // B
1431 "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
1432 "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
1433 "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
1434 MEMACCESS(1)
1435 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1436 MEMACCESS(2)
1437 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1438 "b.gt 1b \n"
1439 : "+r"(src_argb), // %0
1440 "+r"(dst_u), // %1
1441 "+r"(dst_v), // %2
1442 "+r"(width) // %3
1443 :
1444 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1445 "v20", "v21", "v22", "v23", "v24", "v25"
1446 );
1447 }
1448
1449 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Asm fragment: turns three 8x16-bit channel registers into 8 U and 8 V
// bytes. QB, QG and QR are the register operands (e.g. v0.8h) holding the
// blue, green and red inputs. Using the constants in v20-v25:
//   v3 = QB*v20 - QG*v21 - QR*v22 + v25   -> v0.8b = v3 >> 8 (U)
//   v4 = QR*v20 - QG*v24 - QB*v23 + v25   -> v1.8b = v4 >> 8 (V)
// The narrowing shift is unsigned and saturating. v3 and v4 are written
// before all arguments have been read, so the arguments must not name v3 or
// v4; v0 and v1 are only written at the very end and may be used as inputs.
1450 #define RGBTOUV(QB, QG, QR) \
1451 "mul v3.8h, " #QB ",v20.8h \n" /* B */ \
1452 "mul v4.8h, " #QR ",v20.8h \n" /* R */ \
1453 "mls v3.8h, " #QG ",v21.8h \n" /* G */ \
1454 "mls v4.8h, " #QG ",v24.8h \n" /* G */ \
1455 "mls v3.8h, " #QR ",v22.8h \n" /* R */ \
1456 "mls v4.8h, " #QB ",v23.8h \n" /* B */ \
1457 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1458 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1459 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1460 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1461
1462 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1463 // TODO(fbarchard): consider ptrdiff_t for all strides.
1464
// Computes 2x2-subsampled U and V from two vertically adjacent ARGB rows.
// The second row starts at src_argb + src_stride_argb (stride in bytes).
// Each loop iteration consumes 16 pixels from each row and produces 8 U and
// 8 V bytes: uaddlp sums horizontally adjacent pairs of the first row,
// uadalp adds the pair sums of the second row (sum of each 2x2 block),
// urshr #1 halves that with rounding (2x the average), and RGBTOUV applies
// the halved coefficients set up by RGBTOUV_SETUP_REG.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
ARGBToUVRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1465 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
1466 uint8* dst_u, uint8* dst_v, int width) {
1467 const uint8* src_argb_1 = src_argb + src_stride_argb;
1468 asm volatile (
1469 RGBTOUV_SETUP_REG
1470 "1: \n"
1471 MEMACCESS(0)
1472 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1473 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1474 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1475 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1476
1477 MEMACCESS(1)
1478 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1479 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1480 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1481 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1482
1483 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1484 "urshr v1.8h, v1.8h, #1 \n"
1485 "urshr v2.8h, v2.8h, #1 \n"
1486
1487 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1488 RGBTOUV(v0.8h, v1.8h, v2.8h)
1489 MEMACCESS(2)
1490 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1491 MEMACCESS(3)
1492 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1493 "b.gt 1b \n"
1494 : "+r"(src_argb), // %0
1495 "+r"(src_argb_1), // %1
1496 "+r"(dst_u), // %2
1497 "+r"(dst_v), // %3
1498 "+r"(width) // %4
1499 :
1500 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1501 "v20", "v21", "v22", "v23", "v24", "v25"
1502 );
1503 }
1504
1505 // TODO(fbarchard): Subsample match C code.
// Computes 2x2-subsampled U and V from two vertically adjacent ARGB rows
// using the "J" coefficient set (63, 42, 21, 10, 53 loaded below instead of
// RGBTOUV_SETUP_REG). The second row starts at src_argb + src_stride_argb
// (stride in bytes). Each loop iteration consumes 16 pixels from each row
// and produces 8 U and 8 V bytes: pair sums of both rows are accumulated
// (uaddlp/uadalp), halved with rounding (urshr #1), and fed to RGBTOUV.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
ARGBToUVJRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1506 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
1507 uint8* dst_u, uint8* dst_v, int width) {
1508 const uint8* src_argb_1 = src_argb + src_stride_argb;
1509 asm volatile (
1510 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1511 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1512 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1513 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1514 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1515 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1516 "1: \n"
1517 MEMACCESS(0)
1518 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1519 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1520 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1521 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1522 MEMACCESS(1)
1523 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1524 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1525 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1526 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1527
1528 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1529 "urshr v1.8h, v1.8h, #1 \n"
1530 "urshr v2.8h, v2.8h, #1 \n"
1531
1532 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1533 RGBTOUV(v0.8h, v1.8h, v2.8h)
1534 MEMACCESS(2)
1535 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1536 MEMACCESS(3)
1537 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1538 "b.gt 1b \n"
1539 : "+r"(src_argb), // %0
1540 "+r"(src_argb_1), // %1
1541 "+r"(dst_u), // %2
1542 "+r"(dst_v), // %3
1543 "+r"(width) // %4
1544 :
1545 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1546 "v20", "v21", "v22", "v23", "v24", "v25"
1547 );
1548 }
1549
// Computes 2x2-subsampled U and V from two vertically adjacent BGRA rows.
// Same arithmetic as ARGBToUVRow_NEON, but the channels are taken from
// different byte positions: after ld4, byte 3 (v3/v7) is used as B, byte 2
// (v2/v6) as G and byte 1 (v1/v5) as R; byte 0 is ignored. The G sum is
// accumulated in v3 and moved to v1 by the urshr so RGBTOUV receives
// v0, v1, v2 as B, G, R.
// The second row starts at src_bgra + src_stride_bgra (stride in bytes);
// 16 pixels per row are consumed and 8 U + 8 V bytes produced per iteration.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
BGRAToUVRow_NEON(const uint8 * src_bgra,int src_stride_bgra,uint8 * dst_u,uint8 * dst_v,int width)1550 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
1551 uint8* dst_u, uint8* dst_v, int width) {
1552 const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1553 asm volatile (
1554 RGBTOUV_SETUP_REG
1555 "1: \n"
1556 MEMACCESS(0)
1557 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1558 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1559 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1560 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
1561 MEMACCESS(1)
1562 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
1563 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
1564 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1565 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
1566
1567 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1568 "urshr v1.8h, v3.8h, #1 \n"
1569 "urshr v2.8h, v2.8h, #1 \n"
1570
1571 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1572 RGBTOUV(v0.8h, v1.8h, v2.8h)
1573 MEMACCESS(2)
1574 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1575 MEMACCESS(3)
1576 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1577 "b.gt 1b \n"
1578 : "+r"(src_bgra), // %0
1579 "+r"(src_bgra_1), // %1
1580 "+r"(dst_u), // %2
1581 "+r"(dst_v), // %3
1582 "+r"(width) // %4
1583 :
1584 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1585 "v20", "v21", "v22", "v23", "v24", "v25"
1586 );
1587 }
1588
// Computes 2x2-subsampled U and V from two vertically adjacent ABGR rows.
// Same arithmetic as ARGBToUVRow_NEON, but the channels are taken from
// different byte positions: after ld4, byte 2 (v2/v6) is used as B, byte 1
// (v1/v5) as G and byte 0 (v0/v4) as R; byte 3 is ignored. The sums end up
// with B in v0, G in v2 and R in v1, which is why RGBTOUV is invoked with
// (v0, v2, v1).
// The second row starts at src_abgr + src_stride_abgr (stride in bytes);
// 16 pixels per row are consumed and 8 U + 8 V bytes produced per iteration.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
ABGRToUVRow_NEON(const uint8 * src_abgr,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int width)1589 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
1590 uint8* dst_u, uint8* dst_v, int width) {
1591 const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1592 asm volatile (
1593 RGBTOUV_SETUP_REG
1594 "1: \n"
1595 MEMACCESS(0)
1596 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1597 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1598 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1599 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1600 MEMACCESS(1)
1601 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1602 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1603 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1604 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1605
1606 "urshr v0.8h, v3.8h, #1 \n" // 2x average
1607 "urshr v2.8h, v2.8h, #1 \n"
1608 "urshr v1.8h, v1.8h, #1 \n"
1609
1610 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1611 RGBTOUV(v0.8h, v2.8h, v1.8h)
1612 MEMACCESS(2)
1613 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1614 MEMACCESS(3)
1615 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1616 "b.gt 1b \n"
1617 : "+r"(src_abgr), // %0
1618 "+r"(src_abgr_1), // %1
1619 "+r"(dst_u), // %2
1620 "+r"(dst_v), // %3
1621 "+r"(width) // %4
1622 :
1623 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1624 "v20", "v21", "v22", "v23", "v24", "v25"
1625 );
1626 }
1627
// Computes 2x2-subsampled U and V from two vertically adjacent RGBA rows.
// Same arithmetic as ARGBToUVRow_NEON, but the channels are taken from
// different byte positions: after ld4, byte 1 (v1/v5) is used as B, byte 2
// (v2/v6) as G and byte 3 (v3/v7) as R; byte 0 is ignored.
// The second row starts at src_rgba + src_stride_rgba (stride in bytes);
// 16 pixels per row are consumed and 8 U + 8 V bytes produced per iteration.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
RGBAToUVRow_NEON(const uint8 * src_rgba,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int width)1628 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
1629 uint8* dst_u, uint8* dst_v, int width) {
1630 const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1631 asm volatile (
1632 RGBTOUV_SETUP_REG
1633 "1: \n"
1634 MEMACCESS(0)
1635 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1636 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
1637 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1638 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
1639 MEMACCESS(1)
1640 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1641 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
1642 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1643 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
1644
1645 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1646 "urshr v1.8h, v1.8h, #1 \n"
1647 "urshr v2.8h, v2.8h, #1 \n"
1648
1649 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1650 RGBTOUV(v0.8h, v1.8h, v2.8h)
1651 MEMACCESS(2)
1652 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1653 MEMACCESS(3)
1654 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1655 "b.gt 1b \n"
1656 : "+r"(src_rgba), // %0
1657 "+r"(src_rgba_1), // %1
1658 "+r"(dst_u), // %2
1659 "+r"(dst_v), // %3
1660 "+r"(width) // %4
1661 :
1662 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1663 "v20", "v21", "v22", "v23", "v24", "v25"
1664 );
1665 }
1666
// Computes 2x2-subsampled U and V from two vertically adjacent RGB24 rows.
// Same arithmetic as ARGBToUVRow_NEON, but pixels are 3 bytes: ld3 loads
// 48 bytes (16 pixels) per row and de-interleaves them into byte 0 (used as
// B), byte 1 (G) and byte 2 (R).
// The second row starts at src_rgb24 + src_stride_rgb24 (stride in bytes);
// 16 pixels per row are consumed and 8 U + 8 V bytes produced per iteration.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
// NOTE(review): v7 is listed as clobbered but is not written in this function.
RGB24ToUVRow_NEON(const uint8 * src_rgb24,int src_stride_rgb24,uint8 * dst_u,uint8 * dst_v,int width)1667 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
1668 uint8* dst_u, uint8* dst_v, int width) {
1669 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1670 asm volatile (
1671 RGBTOUV_SETUP_REG
1672 "1: \n"
1673 MEMACCESS(0)
1674 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
1675 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1676 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1677 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1678 MEMACCESS(1)
1679 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
1680 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1681 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1682 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1683
1684 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1685 "urshr v1.8h, v1.8h, #1 \n"
1686 "urshr v2.8h, v2.8h, #1 \n"
1687
1688 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1689 RGBTOUV(v0.8h, v1.8h, v2.8h)
1690 MEMACCESS(2)
1691 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1692 MEMACCESS(3)
1693 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1694 "b.gt 1b \n"
1695 : "+r"(src_rgb24), // %0
1696 "+r"(src_rgb24_1), // %1
1697 "+r"(dst_u), // %2
1698 "+r"(dst_v), // %3
1699 "+r"(width) // %4
1700 :
1701 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1702 "v20", "v21", "v22", "v23", "v24", "v25"
1703 );
1704 }
1705
// Computes 2x2-subsampled U and V from two vertically adjacent RAW rows.
// Same arithmetic as RGB24ToUVRow_NEON with the channel order reversed:
// after ld3, byte 2 (v2/v6) is used as B, byte 1 (v1/v5) as G and byte 0
// (v0/v4) as R, which is why RGBTOUV is invoked with (v2, v1, v0).
// The second row starts at src_raw + src_stride_raw (stride in bytes); each
// ld3 reads 48 bytes, i.e. 16 three-byte pixels per row, and 8 U + 8 V bytes
// are produced per iteration.
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
// NOTE(review): v7 is listed as clobbered but is not written in this function.
RAWToUVRow_NEON(const uint8 * src_raw,int src_stride_raw,uint8 * dst_u,uint8 * dst_v,int width)1706 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
1707 uint8* dst_u, uint8* dst_v, int width) {
1708 const uint8* src_raw_1 = src_raw + src_stride_raw;
1709 asm volatile (
1710 RGBTOUV_SETUP_REG
1711 "1: \n"
1712 MEMACCESS(0)
1713 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
1714 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1715 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1716 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1717 MEMACCESS(1)
1718 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels
1719 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1720 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1721 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1722
1723 "urshr v2.8h, v2.8h, #1 \n" // 2x average
1724 "urshr v1.8h, v1.8h, #1 \n"
1725 "urshr v0.8h, v0.8h, #1 \n"
1726
1727 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1728 RGBTOUV(v2.8h, v1.8h, v0.8h)
1729 MEMACCESS(2)
1730 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1731 MEMACCESS(3)
1732 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1733 "b.gt 1b \n"
1734 : "+r"(src_raw), // %0
1735 "+r"(src_raw_1), // %1
1736 "+r"(dst_u), // %2
1737 "+r"(dst_v), // %3
1738 "+r"(width) // %4
1739 :
1740 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1741 "v20", "v21", "v22", "v23", "v24", "v25"
1742 );
1743 }
1744
1745 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Computes 2x2-subsampled U and V from two vertically adjacent RGB565 rows.
// The second row starts at src_rgb565 + src_stride_rgb565 (stride in bytes).
// Each loop iteration handles 16 pixels per row as two groups of 8: every
// 16-byte load is expanded by the RGB565TOARGB asm fragment (defined outside
// this view), whose B, G, R results are read from the low 8 bytes of v0, v1,
// v2. Pair sums of the first row (uaddlp) and of the second row (uadalp) are
// collected in v16/v18/v20 for the first group and v17/v19/v21 for the
// second, then merged into full 8-lane registers with ins.
// After a rounding halve (urshr #1) the U/V arithmetic is
//   U = (v22*B - v23*G - v24*R + v27) >> 8
//   V = (v22*R - v26*G - v25*B + v27) >> 8
// using the constants loaded below (not RGBTOUV_SETUP_REG, because v20-v25
// are needed as accumulators/constants in a different layout here).
// The loop is bottom-tested (subs ... b.gt), so one iteration always runs.
// NOTE(review): presumably width is a positive multiple of 16 - confirm callers.
// NOTE(review): the clobber list presumably covers the scratch registers of
// RGB565TOARGB (v3, v7 are not written by the visible code) - verify.
RGB565ToUVRow_NEON(const uint8 * src_rgb565,int src_stride_rgb565,uint8 * dst_u,uint8 * dst_v,int width)1746 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
1747 uint8* dst_u, uint8* dst_v, int width) {
1748 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1749 asm volatile (
1750 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
1751 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
1752 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
1753 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
1754 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
1755 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1756 "1: \n"
1757 MEMACCESS(0)
1758 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1759 RGB565TOARGB
1760 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1761 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1762 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1763 MEMACCESS(0)
1764 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
1765 RGB565TOARGB
1766 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1767 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1768 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1769
1770 MEMACCESS(1)
1771 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
1772 RGB565TOARGB
1773 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1774 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1775 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1776 MEMACCESS(1)
1777 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
1778 RGB565TOARGB
1779 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1780 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1781 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1782
1783 "ins v16.D[1], v17.D[0] \n" // merge both groups: B
1784 "ins v18.D[1], v19.D[0] \n" // merge both groups: G
1785 "ins v20.D[1], v21.D[0] \n" // merge both groups: R
1786
1787 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1788 "urshr v5.8h, v18.8h, #1 \n"
1789 "urshr v6.8h, v20.8h, #1 \n"
1790
1791 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1792 "mul v16.8h, v4.8h, v22.8h \n" // B
1793 "mls v16.8h, v5.8h, v23.8h \n" // G
1794 "mls v16.8h, v6.8h, v24.8h \n" // R
1795 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
1796 "mul v17.8h, v6.8h, v22.8h \n" // R
1797 "mls v17.8h, v5.8h, v26.8h \n" // G
1798 "mls v17.8h, v4.8h, v25.8h \n" // B
1799 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
1800 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
1801 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
1802 MEMACCESS(2)
1803 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1804 MEMACCESS(3)
1805 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1806 "b.gt 1b \n"
1807 : "+r"(src_rgb565), // %0
1808 "+r"(src_rgb565_1), // %1
1809 "+r"(dst_u), // %2
1810 "+r"(dst_v), // %3
1811 "+r"(width) // %4
1812 :
1813 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1814 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1815 "v25", "v26", "v27"
1816 );
1817 }
1818
1819 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
ARGB1555ToUVRow_NEON(const uint8 * src_argb1555,int src_stride_argb1555,uint8 * dst_u,uint8 * dst_v,int width)1820 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
1821 uint8* dst_u, uint8* dst_v, int width) {
1822 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1823 asm volatile (
1824 RGBTOUV_SETUP_REG
1825 "1: \n"
1826 MEMACCESS(0)
1827 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1828 RGB555TOARGB
1829 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1830 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1831 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1832 MEMACCESS(0)
1833 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
1834 RGB555TOARGB
1835 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1836 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1837 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1838
1839 MEMACCESS(1)
1840 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
1841 RGB555TOARGB
1842 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1843 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1844 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1845 MEMACCESS(1)
1846 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
1847 RGB555TOARGB
1848 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1849 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1850 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1851
1852 "ins v16.D[1], v26.D[0] \n"
1853 "ins v17.D[1], v27.D[0] \n"
1854 "ins v18.D[1], v28.D[0] \n"
1855
1856 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1857 "urshr v5.8h, v17.8h, #1 \n"
1858 "urshr v6.8h, v18.8h, #1 \n"
1859
1860 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1861 "mul v2.8h, v4.8h, v20.8h \n" // B
1862 "mls v2.8h, v5.8h, v21.8h \n" // G
1863 "mls v2.8h, v6.8h, v22.8h \n" // R
1864 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
1865 "mul v3.8h, v6.8h, v20.8h \n" // R
1866 "mls v3.8h, v5.8h, v24.8h \n" // G
1867 "mls v3.8h, v4.8h, v23.8h \n" // B
1868 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1869 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
1870 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1871 MEMACCESS(2)
1872 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1873 MEMACCESS(3)
1874 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1875 "b.gt 1b \n"
1876 : "+r"(src_argb1555), // %0
1877 "+r"(src_argb1555_1), // %1
1878 "+r"(dst_u), // %2
1879 "+r"(dst_v), // %3
1880 "+r"(width) // %4
1881 :
1882 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1883 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1884 "v26", "v27", "v28"
1885 );
1886 }
1887
1888 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
ARGB4444ToUVRow_NEON(const uint8 * src_argb4444,int src_stride_argb4444,uint8 * dst_u,uint8 * dst_v,int width)1889 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
1890 uint8* dst_u, uint8* dst_v, int width) {
1891 const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
1892 asm volatile (
1893 RGBTOUV_SETUP_REG
1894 "1: \n"
1895 MEMACCESS(0)
1896 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
1897 ARGB4444TOARGB
1898 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1899 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1900 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1901 MEMACCESS(0)
1902 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
1903 ARGB4444TOARGB
1904 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1905 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1906 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1907
1908 MEMACCESS(1)
1909 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
1910 ARGB4444TOARGB
1911 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1912 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1913 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1914 MEMACCESS(1)
1915 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
1916 ARGB4444TOARGB
1917 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1918 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1919 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1920
1921 "ins v16.D[1], v26.D[0] \n"
1922 "ins v17.D[1], v27.D[0] \n"
1923 "ins v18.D[1], v28.D[0] \n"
1924
1925 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1926 "urshr v5.8h, v17.8h, #1 \n"
1927 "urshr v6.8h, v18.8h, #1 \n"
1928
1929 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1930 "mul v2.8h, v4.8h, v20.8h \n" // B
1931 "mls v2.8h, v5.8h, v21.8h \n" // G
1932 "mls v2.8h, v6.8h, v22.8h \n" // R
1933 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
1934 "mul v3.8h, v6.8h, v20.8h \n" // R
1935 "mls v3.8h, v5.8h, v24.8h \n" // G
1936 "mls v3.8h, v4.8h, v23.8h \n" // B
1937 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1938 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
1939 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1940 MEMACCESS(2)
1941 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1942 MEMACCESS(3)
1943 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1944 "b.gt 1b \n"
1945 : "+r"(src_argb4444), // %0
1946 "+r"(src_argb4444_1), // %1
1947 "+r"(dst_u), // %2
1948 "+r"(dst_v), // %3
1949 "+r"(width) // %4
1950 :
1951 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1952 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1953 "v26", "v27", "v28"
1954
1955 );
1956 }
1957
// Convert a row of RGB565 pixels to 8-bit luma.
//   Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16   (saturating add)
//
//   src_rgb565  source row, 2 bytes per pixel (16 bytes read per pass).
//   dst_y       destination row, 8 bytes written per pass.
//   width       pixel count. The loop body runs before the count is tested
//               and handles 8 pixels per pass, so the work done is width
//               rounded up to a multiple of 8.
//
// NOTE(review): RGB565TOARGB is defined elsewhere; this code assumes it leaves
// B, G, R bytes in v0, v1, v2. The v4/v6 clobbers are presumably its scratch
// registers -- confirm against the macro.
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  asm volatile (
    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
    "movi       v27.8b, #16                    \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    RGB565TOARGB
    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_rgb565),  // %0
    "+r"(dst_y),       // %1
    "+r"(width)        // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
    "v24", "v25", "v26", "v27"
  );
}
1985
// Convert a row of ARGB1555 pixels to 8-bit luma.
//   Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16   (saturating add)
//
//   src_argb1555  source row, 2 bytes per pixel (16 bytes read per pass).
//   dst_y         destination row, 8 bytes written per pass.
//   width         pixel count. The loop body runs before the count is tested
//                 and handles 8 pixels per pass, so the work done is width
//                 rounded up to a multiple of 8.
//
// NOTE(review): ARGB1555TOARGB is defined elsewhere; this code assumes it
// leaves B, G, R bytes in v0, v1, v2 and does not touch the coefficient
// registers v4-v7 -- confirm against the macro.
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGB1555TOARGB
    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_argb1555),  // %0
    "+r"(dst_y),         // %1
    "+r"(width)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
2012
// Convert a row of ARGB4444 pixels to 8-bit luma.
//   Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16   (saturating add)
//
//   src_argb4444  source row, 2 bytes per pixel (16 bytes read per pass).
//   dst_y         destination row, 8 bytes written per pass.
//   width         pixel count. The loop body runs before the count is tested
//                 and handles 8 pixels per pass, so the work done is width
//                 rounded up to a multiple of 8.
//
// NOTE(review): ARGB4444TOARGB is defined elsewhere; this code assumes it
// leaves B, G, R bytes in v0, v1, v2 -- confirm against the macro.
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  asm volatile (
    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
    "movi       v27.8b, #16                    \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGB4444TOARGB
    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v27.8b           \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_argb4444),  // %0
    "+r"(dst_y),         // %1
    "+r"(width)          // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}
2039
// Convert a row of BGRA pixels (4 bytes each) to 8-bit luma.
//   Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16   (saturating add)
//
// ld4 de-interleaves each pixel's four bytes into v0..v3; bytes 1, 2 and 3 are
// weighted as R, G and B, and byte 0 (v0) is not used.
//
//   src_bgra  source row, 32 bytes read per pass.
//   dst_y     destination row, 8 bytes written per pass.
//   width     pixel count. The loop body runs before the count is tested and
//             handles 8 pixels per pass, so the work done is width rounded up
//             to a multiple of 8.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2065
// Convert a row of ABGR pixels (4 bytes each) to 8-bit luma.
//   Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16   (saturating add)
//
// ld4 de-interleaves each pixel's four bytes into v0..v3; bytes 0, 1 and 2 are
// weighted as R, G and B, and byte 3 (v3) is not used.
//
//   src_abgr  source row, 32 bytes read per pass.
//   dst_y     destination row, 8 bytes written per pass.
//   width     pixel count. The loop body runs before the count is tested and
//             handles 8 pixels per pass, so the work done is width rounded up
//             to a multiple of 8.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2091
// Convert a row of RGBA pixels (4 bytes each) to 8-bit luma.
//   Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16   (saturating add)
//
// ld4 de-interleaves each pixel's four bytes into v0..v3; bytes 1, 2 and 3 are
// weighted as B, G and R, and byte 0 (v0) is not used.
//
//   src_rgba  source row, 32 bytes read per pass.
//   dst_y     destination row, 8 bytes written per pass.
//   width     pixel count. The loop body runs before the count is tested and
//             handles 8 pixels per pass, so the work done is width rounded up
//             to a multiple of 8.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2117
// Convert a row of RGB24 pixels (3 bytes each) to 8-bit luma.
//   Y = sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16   (saturating add)
//
// ld3 de-interleaves each pixel's three bytes into v0..v2, which are weighted
// as B, G and R respectively.
//
//   src_rgb24  source row, 24 bytes read per pass.
//   dst_y      destination row, 8 bytes written per pass.
//   width      pixel count. The loop body runs before the count is tested and
//              handles 8 pixels per pass, so the work done is width rounded up
//              to a multiple of 8.
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_y),      // %1
    "+r"(width)       // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2143
// Convert a row of RAW pixels (3 bytes each) to 8-bit luma.
//   Y = sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16   (saturating add)
//
// ld3 de-interleaves each pixel's three bytes into v0..v2, which are weighted
// as R, G and B respectively (the reverse of the RGB24 channel order).
//
//   src_raw  source row, 24 bytes read per pass.
//   dst_y    destination row, 8 bytes written per pass.
//   width    pixel count. The loop body runs before the count is tested and
//            handles 8 pixels per pass, so the work done is width rounded up
//            to a multiple of 8.
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
    "movi       v7.8b, #16                     \n"  // Add 16 constant
  "1:                                          \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
    "uqadd      v0.8b, v0.8b, v7.8b            \n"  // + 16, saturating
    MEMACCESS(1)
    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
    "b.gt       1b                             \n"
  : "+r"(src_raw),  // %0
    "+r"(dst_y),    // %1
    "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2169
2170 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2171 void InterpolateRow_NEON(uint8* dst_ptr,
2172 const uint8* src_ptr, ptrdiff_t src_stride,
2173 int dst_width, int source_y_fraction) {
2174 int y1_fraction = source_y_fraction;
2175 int y0_fraction = 256 - y1_fraction;
2176 const uint8* src_ptr1 = src_ptr + src_stride;
2177 asm volatile (
2178 "cmp %w4, #0 \n"
2179 "b.eq 100f \n"
2180 "cmp %w4, #128 \n"
2181 "b.eq 50f \n"
2182
2183 "dup v5.16b, %w4 \n"
2184 "dup v4.16b, %w5 \n"
2185 // General purpose row blend.
2186 "1: \n"
2187 MEMACCESS(1)
2188 "ld1 {v0.16b}, [%1], #16 \n"
2189 MEMACCESS(2)
2190 "ld1 {v1.16b}, [%2], #16 \n"
2191 "subs %w3, %w3, #16 \n"
2192 "umull v2.8h, v0.8b, v4.8b \n"
2193 "umull2 v3.8h, v0.16b, v4.16b \n"
2194 "umlal v2.8h, v1.8b, v5.8b \n"
2195 "umlal2 v3.8h, v1.16b, v5.16b \n"
2196 "rshrn v0.8b, v2.8h, #8 \n"
2197 "rshrn2 v0.16b, v3.8h, #8 \n"
2198 MEMACCESS(0)
2199 "st1 {v0.16b}, [%0], #16 \n"
2200 "b.gt 1b \n"
2201 "b 99f \n"
2202
2203 // Blend 50 / 50.
2204 "50: \n"
2205 MEMACCESS(1)
2206 "ld1 {v0.16b}, [%1], #16 \n"
2207 MEMACCESS(2)
2208 "ld1 {v1.16b}, [%2], #16 \n"
2209 "subs %w3, %w3, #16 \n"
2210 "urhadd v0.16b, v0.16b, v1.16b \n"
2211 MEMACCESS(0)
2212 "st1 {v0.16b}, [%0], #16 \n"
2213 "b.gt 50b \n"
2214 "b 99f \n"
2215
2216 // Blend 100 / 0 - Copy row unchanged.
2217 "100: \n"
2218 MEMACCESS(1)
2219 "ld1 {v0.16b}, [%1], #16 \n"
2220 "subs %w3, %w3, #16 \n"
2221 MEMACCESS(0)
2222 "st1 {v0.16b}, [%0], #16 \n"
2223 "b.gt 100b \n"
2224
2225 "99: \n"
2226 : "+r"(dst_ptr), // %0
2227 "+r"(src_ptr), // %1
2228 "+r"(src_ptr1), // %2
2229 "+r"(dst_width), // %3
2230 "+r"(y1_fraction), // %4
2231 "+r"(y0_fraction) // %5
2232 :
2233 : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2234 );
2235 }
2236
2237 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
ARGBBlendRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2238 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2239 uint8* dst_argb, int width) {
2240 asm volatile (
2241 "subs %w3, %w3, #8 \n"
2242 "b.lt 89f \n"
2243 // Blend 8 pixels.
2244 "8: \n"
2245 MEMACCESS(0)
2246 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
2247 MEMACCESS(1)
2248 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
2249 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2250 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2251 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2252 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2253 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2254 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2255 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2256 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2257 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2258 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2259 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2260 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2261 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2262 "movi v3.8b, #255 \n" // a = 255
2263 MEMACCESS(2)
2264 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2265 "b.ge 8b \n"
2266
2267 "89: \n"
2268 "adds %w3, %w3, #8-1 \n"
2269 "b.lt 99f \n"
2270
2271 // Blend 1 pixels.
2272 "1: \n"
2273 MEMACCESS(0)
2274 "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
2275 MEMACCESS(1)
2276 "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
2277 "subs %w3, %w3, #1 \n" // 1 processed per loop.
2278 "umull v16.8h, v4.8b, v3.8b \n" // db * a
2279 "umull v17.8h, v5.8b, v3.8b \n" // dg * a
2280 "umull v18.8h, v6.8b, v3.8b \n" // dr * a
2281 "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
2282 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
2283 "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
2284 "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
2285 "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
2286 "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
2287 "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
2288 "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
2289 "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
2290 "movi v3.8b, #255 \n" // a = 255
2291 MEMACCESS(2)
2292 "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
2293 "b.ge 1b \n"
2294
2295 "99: \n"
2296
2297 : "+r"(src_argb0), // %0
2298 "+r"(src_argb1), // %1
2299 "+r"(dst_argb), // %2
2300 "+r"(width) // %3
2301 :
2302 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2303 "v16", "v17", "v18"
2304 );
2305 }
2306
2307 // Attenuate 8 pixels at a time.
ARGBAttenuateRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2308 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2309 asm volatile (
2310 // Attenuate 8 pixels.
2311 "1: \n"
2312 MEMACCESS(0)
2313 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
2314 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2315 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2316 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2317 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2318 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2319 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2320 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2321 MEMACCESS(1)
2322 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
2323 "b.gt 1b \n"
2324 : "+r"(src_argb), // %0
2325 "+r"(dst_argb), // %1
2326 "+r"(width) // %2
2327 :
2328 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2329 );
2330 }
2331
2332 // Quantize 8 ARGB pixels (32 bytes).
2333 // dst = (dst * scale >> 16) * interval_size + interval_offset;
ARGBQuantizeRow_NEON(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)2334 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
2335 int interval_offset, int width) {
2336 asm volatile (
2337 "dup v4.8h, %w2 \n"
2338 "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
2339 "dup v5.8h, %w3 \n" // interval multiply.
2340 "dup v6.8h, %w4 \n" // interval add
2341
2342 // 8 pixel loop.
2343 "1: \n"
2344 MEMACCESS(0)
2345 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
2346 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2347 "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
2348 "uxtl v1.8h, v1.8b \n"
2349 "uxtl v2.8h, v2.8b \n"
2350 "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
2351 "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
2352 "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
2353 "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
2354 "mul v1.8h, v1.8h, v5.8h \n" // g
2355 "mul v2.8h, v2.8h, v5.8h \n" // r
2356 "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
2357 "add v1.8h, v1.8h, v6.8h \n" // g
2358 "add v2.8h, v2.8h, v6.8h \n" // r
2359 "uqxtn v0.8b, v0.8h \n"
2360 "uqxtn v1.8b, v1.8h \n"
2361 "uqxtn v2.8b, v2.8h \n"
2362 MEMACCESS(0)
2363 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
2364 "b.gt 1b \n"
2365 : "+r"(dst_argb), // %0
2366 "+r"(width) // %1
2367 : "r"(scale), // %2
2368 "r"(interval_size), // %3
2369 "r"(interval_offset) // %4
2370 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2371 );
2372 }
2373
2374 // Shade 8 pixels at a time by specified value.
2375 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2376 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
ARGBShadeRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)2377 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
2378 uint32 value) {
2379 asm volatile (
2380 "dup v0.4s, %w3 \n" // duplicate scale value.
2381 "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
2382 "ushr v0.8h, v0.8h, #1 \n" // scale / 2.
2383
2384 // 8 pixel loop.
2385 "1: \n"
2386 MEMACCESS(0)
2387 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2388 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2389 "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
2390 "uxtl v5.8h, v5.8b \n"
2391 "uxtl v6.8h, v6.8b \n"
2392 "uxtl v7.8h, v7.8b \n"
2393 "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
2394 "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
2395 "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
2396 "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
2397 "uqxtn v4.8b, v4.8h \n"
2398 "uqxtn v5.8b, v5.8h \n"
2399 "uqxtn v6.8b, v6.8h \n"
2400 "uqxtn v7.8b, v7.8h \n"
2401 MEMACCESS(1)
2402 "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
2403 "b.gt 1b \n"
2404 : "+r"(src_argb), // %0
2405 "+r"(dst_argb), // %1
2406 "+r"(width) // %2
2407 : "r"(value) // %3
2408 : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2409 );
2410 }
2411
2412 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2413 // Similar to ARGBToYJ but stores ARGB.
2414 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
ARGBGrayRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2415 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2416 asm volatile (
2417 "movi v24.8b, #15 \n" // B * 0.11400 coefficient
2418 "movi v25.8b, #75 \n" // G * 0.58700 coefficient
2419 "movi v26.8b, #38 \n" // R * 0.29900 coefficient
2420 "1: \n"
2421 MEMACCESS(0)
2422 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2423 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2424 "umull v4.8h, v0.8b, v24.8b \n" // B
2425 "umlal v4.8h, v1.8b, v25.8b \n" // G
2426 "umlal v4.8h, v2.8b, v26.8b \n" // R
2427 "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
2428 "orr v1.8b, v0.8b, v0.8b \n" // G
2429 "orr v2.8b, v0.8b, v0.8b \n" // R
2430 MEMACCESS(1)
2431 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
2432 "b.gt 1b \n"
2433 : "+r"(src_argb), // %0
2434 "+r"(dst_argb), // %1
2435 "+r"(width) // %2
2436 :
2437 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2438 );
2439 }
2440
2441 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2442 // b = (r * 35 + g * 68 + b * 17) >> 7
2443 // g = (r * 45 + g * 88 + b * 22) >> 7
2444 // r = (r * 50 + g * 98 + b * 24) >> 7
2445
ARGBSepiaRow_NEON(uint8 * dst_argb,int width)2446 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2447 asm volatile (
2448 "movi v20.8b, #17 \n" // BB coefficient
2449 "movi v21.8b, #68 \n" // BG coefficient
2450 "movi v22.8b, #35 \n" // BR coefficient
2451 "movi v24.8b, #22 \n" // GB coefficient
2452 "movi v25.8b, #88 \n" // GG coefficient
2453 "movi v26.8b, #45 \n" // GR coefficient
2454 "movi v28.8b, #24 \n" // BB coefficient
2455 "movi v29.8b, #98 \n" // BG coefficient
2456 "movi v30.8b, #50 \n" // BR coefficient
2457 "1: \n"
2458 MEMACCESS(0)
2459 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
2460 "subs %w1, %w1, #8 \n" // 8 processed per loop.
2461 "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
2462 "umlal v4.8h, v1.8b, v21.8b \n" // G
2463 "umlal v4.8h, v2.8b, v22.8b \n" // R
2464 "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
2465 "umlal v5.8h, v1.8b, v25.8b \n" // G
2466 "umlal v5.8h, v2.8b, v26.8b \n" // R
2467 "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
2468 "umlal v6.8h, v1.8b, v29.8b \n" // G
2469 "umlal v6.8h, v2.8b, v30.8b \n" // R
2470 "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
2471 "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
2472 "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
2473 MEMACCESS(0)
2474 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
2475 "b.gt 1b \n"
2476 : "+r"(dst_argb), // %0
2477 "+r"(width) // %1
2478 :
2479 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2480 "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2481 );
2482 }
2483
2484 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
2485 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
2486 // needs to saturate. Consider doing a non-saturating version.
ARGBColorMatrixRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const int8 * matrix_argb,int width)2487 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
2488 const int8* matrix_argb, int width) {
2489 asm volatile (
2490 MEMACCESS(3)
2491 "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors.
2492 "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
2493 "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.
2494
2495 "1: \n"
2496 MEMACCESS(0)
2497 "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
2498 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2499 "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
2500 "uxtl v17.8h, v17.8b \n" // g
2501 "uxtl v18.8h, v18.8b \n" // r
2502 "uxtl v19.8h, v19.8b \n" // a
2503 "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
2504 "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
2505 "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
2506 "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
2507 "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
2508 "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
2509 "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
2510 "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
2511 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
2512 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
2513 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
2514 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
2515 "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
2516 "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
2517 "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
2518 "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
2519 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
2520 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
2521 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
2522 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
2523 "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
2524 "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
2525 "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
2526 "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
2527 "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
2528 "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
2529 "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
2530 "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
2531 "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
2532 "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
2533 "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
2534 "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
2535 MEMACCESS(1)
2536 "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
2537 "b.gt 1b \n"
2538 : "+r"(src_argb), // %0
2539 "+r"(dst_argb), // %1
2540 "+r"(width) // %2
2541 : "r"(matrix_argb) // %3
2542 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
2543 "v18", "v19", "v22", "v23", "v24", "v25"
2544 );
2545 }
2546
2547 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2548 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2549 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2550 uint8* dst_argb, int width) {
2551 asm volatile (
2552 // 8 pixel loop.
2553 "1: \n"
2554 MEMACCESS(0)
2555 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2556 MEMACCESS(1)
2557 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2558 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2559 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
2560 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
2561 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
2562 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
2563 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
2564 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
2565 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
2566 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
2567 MEMACCESS(2)
2568 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2569 "b.gt 1b \n"
2570
2571 : "+r"(src_argb0), // %0
2572 "+r"(src_argb1), // %1
2573 "+r"(dst_argb), // %2
2574 "+r"(width) // %3
2575 :
2576 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2577 );
2578 }
2579
2580 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2581 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2582 uint8* dst_argb, int width) {
2583 asm volatile (
2584 // 8 pixel loop.
2585 "1: \n"
2586 MEMACCESS(0)
2587 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2588 MEMACCESS(1)
2589 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2590 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2591 "uqadd v0.8b, v0.8b, v4.8b \n"
2592 "uqadd v1.8b, v1.8b, v5.8b \n"
2593 "uqadd v2.8b, v2.8b, v6.8b \n"
2594 "uqadd v3.8b, v3.8b, v7.8b \n"
2595 MEMACCESS(2)
2596 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2597 "b.gt 1b \n"
2598
2599 : "+r"(src_argb0), // %0
2600 "+r"(src_argb1), // %1
2601 "+r"(dst_argb), // %2
2602 "+r"(width) // %3
2603 :
2604 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2605 );
2606 }
2607
2608 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2609 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
2610 uint8* dst_argb, int width) {
2611 asm volatile (
2612 // 8 pixel loop.
2613 "1: \n"
2614 MEMACCESS(0)
2615 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2616 MEMACCESS(1)
2617 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2618 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2619 "uqsub v0.8b, v0.8b, v4.8b \n"
2620 "uqsub v1.8b, v1.8b, v5.8b \n"
2621 "uqsub v2.8b, v2.8b, v6.8b \n"
2622 "uqsub v3.8b, v3.8b, v7.8b \n"
2623 MEMACCESS(2)
2624 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2625 "b.gt 1b \n"
2626
2627 : "+r"(src_argb0), // %0
2628 "+r"(src_argb1), // %1
2629 "+r"(dst_argb), // %2
2630 "+r"(width) // %3
2631 :
2632 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2633 );
2634 }
2635
2636 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2637 // A = 255
2638 // R = Sobel
2639 // G = Sobel
2640 // B = Sobel
SobelRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2641 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2642 uint8* dst_argb, int width) {
2643 asm volatile (
2644 "movi v3.8b, #255 \n" // alpha
2645 // 8 pixel loop.
2646 "1: \n"
2647 MEMACCESS(0)
2648 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
2649 MEMACCESS(1)
2650 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
2651 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2652 "uqadd v0.8b, v0.8b, v1.8b \n" // add
2653 "orr v1.8b, v0.8b, v0.8b \n"
2654 "orr v2.8b, v0.8b, v0.8b \n"
2655 MEMACCESS(2)
2656 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2657 "b.gt 1b \n"
2658 : "+r"(src_sobelx), // %0
2659 "+r"(src_sobely), // %1
2660 "+r"(dst_argb), // %2
2661 "+r"(width) // %3
2662 :
2663 : "cc", "memory", "v0", "v1", "v2", "v3"
2664 );
2665 }
2666
2667 // Adds Sobel X and Sobel Y and stores Sobel into plane.
SobelToPlaneRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)2668 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2669 uint8* dst_y, int width) {
2670 asm volatile (
2671 // 16 pixel loop.
2672 "1: \n"
2673 MEMACCESS(0)
2674 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
2675 MEMACCESS(1)
2676 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
2677 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2678 "uqadd v0.16b, v0.16b, v1.16b \n" // add
2679 MEMACCESS(2)
2680 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
2681 "b.gt 1b \n"
2682 : "+r"(src_sobelx), // %0
2683 "+r"(src_sobely), // %1
2684 "+r"(dst_y), // %2
2685 "+r"(width) // %3
2686 :
2687 : "cc", "memory", "v0", "v1"
2688 );
2689 }
2690
2691 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2692 // A = 255
2693 // R = Sobel X
2694 // G = Sobel
2695 // B = Sobel Y
SobelXYRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2696 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
2697 uint8* dst_argb, int width) {
2698 asm volatile (
2699 "movi v3.8b, #255 \n" // alpha
2700 // 8 pixel loop.
2701 "1: \n"
2702 MEMACCESS(0)
2703 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
2704 MEMACCESS(1)
2705 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
2706 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2707 "uqadd v1.8b, v0.8b, v2.8b \n" // add
2708 MEMACCESS(2)
2709 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2710 "b.gt 1b \n"
2711 : "+r"(src_sobelx), // %0
2712 "+r"(src_sobely), // %1
2713 "+r"(dst_argb), // %2
2714 "+r"(width) // %3
2715 :
2716 : "cc", "memory", "v0", "v1", "v2", "v3"
2717 );
2718 }
2719
2720 // SobelX as a matrix is
2721 // -1 0 1
2722 // -2 0 2
2723 // -1 0 1
SobelXRow_NEON(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)2724 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
2725 const uint8* src_y2, uint8* dst_sobelx, int width) {
2726 asm volatile (
2727 "1: \n"
2728 MEMACCESS(0)
2729 "ld1 {v0.8b}, [%0],%5 \n" // top
2730 MEMACCESS(0)
2731 "ld1 {v1.8b}, [%0],%6 \n"
2732 "usubl v0.8h, v0.8b, v1.8b \n"
2733 MEMACCESS(1)
2734 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
2735 MEMACCESS(1)
2736 "ld1 {v3.8b}, [%1],%6 \n"
2737 "usubl v1.8h, v2.8b, v3.8b \n"
2738 "add v0.8h, v0.8h, v1.8h \n"
2739 "add v0.8h, v0.8h, v1.8h \n"
2740 MEMACCESS(2)
2741 "ld1 {v2.8b}, [%2],%5 \n" // bottom
2742 MEMACCESS(2)
2743 "ld1 {v3.8b}, [%2],%6 \n"
2744 "subs %w4, %w4, #8 \n" // 8 pixels
2745 "usubl v1.8h, v2.8b, v3.8b \n"
2746 "add v0.8h, v0.8h, v1.8h \n"
2747 "abs v0.8h, v0.8h \n"
2748 "uqxtn v0.8b, v0.8h \n"
2749 MEMACCESS(3)
2750 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
2751 "b.gt 1b \n"
2752 : "+r"(src_y0), // %0
2753 "+r"(src_y1), // %1
2754 "+r"(src_y2), // %2
2755 "+r"(dst_sobelx), // %3
2756 "+r"(width) // %4
2757 : "r"(2LL), // %5
2758 "r"(6LL) // %6
2759 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2760 );
2761 }
2762
2763 // SobelY as a matrix is
2764 // -1 -2 -1
2765 // 0 0 0
2766 // 1 2 1
SobelYRow_NEON(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)2767 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
2768 uint8* dst_sobely, int width) {
2769 asm volatile (
2770 "1: \n"
2771 MEMACCESS(0)
2772 "ld1 {v0.8b}, [%0],%4 \n" // left
2773 MEMACCESS(1)
2774 "ld1 {v1.8b}, [%1],%4 \n"
2775 "usubl v0.8h, v0.8b, v1.8b \n"
2776 MEMACCESS(0)
2777 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
2778 MEMACCESS(1)
2779 "ld1 {v3.8b}, [%1],%4 \n"
2780 "usubl v1.8h, v2.8b, v3.8b \n"
2781 "add v0.8h, v0.8h, v1.8h \n"
2782 "add v0.8h, v0.8h, v1.8h \n"
2783 MEMACCESS(0)
2784 "ld1 {v2.8b}, [%0],%5 \n" // right
2785 MEMACCESS(1)
2786 "ld1 {v3.8b}, [%1],%5 \n"
2787 "subs %w3, %w3, #8 \n" // 8 pixels
2788 "usubl v1.8h, v2.8b, v3.8b \n"
2789 "add v0.8h, v0.8h, v1.8h \n"
2790 "abs v0.8h, v0.8h \n"
2791 "uqxtn v0.8b, v0.8h \n"
2792 MEMACCESS(2)
2793 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
2794 "b.gt 1b \n"
2795 : "+r"(src_y0), // %0
2796 "+r"(src_y1), // %1
2797 "+r"(dst_sobely), // %2
2798 "+r"(width) // %3
2799 : "r"(1LL), // %4
2800 "r"(6LL) // %5
2801 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2802 );
2803 }
2804 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2805
2806 #ifdef __cplusplus
2807 } // extern "C"
2808 } // namespace libyuv
2809 #endif
2810