1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
20 !defined(__aarch64__)
21
22 // d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are
23 // reserved.
24
// Register contract shared by the READ* macros below and YUVTORGB:
// q0: Y uint16x8_t
// d2: U uint8x8_t
// d3: V uint8x8_t
// Y is widened to 16 bits with the 8-bit value duplicated into both bytes of
// each lane (i.e. Y * 0x101); U/V are left 8-bit with each chroma byte
// duplicated so one chroma sample covers its pixel pair.

// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
  "vld1.8      {d0}, [%[src_y]]!             \n" \
  "vld1.32     {d2[0]}, [%[src_u]]!          \n" /* 4 U into d2 bytes 0-3 */ \
  "vld1.32     {d2[1]}, [%[src_v]]!          \n" /* 4 V into d2 bytes 4-7 */ \
  "vmov.u8     d1, d0                        \n" \
  "vmovl.u8    q1, d2                        \n" /* widen: U -> d2, V -> d3 */ \
  "vzip.u8     d0, d1                        \n" /* duplicate Y into both bytes */ \
  "vsli.u16    q1, q1, #8                    \n" /* duplicate each U/V byte */

// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
  "vld1.8      {d0}, [%[src_y]]!             \n" \
  "vld1.8      {d2}, [%[src_u]]!             \n" \
  "vmovl.u8    q0, d0                        \n" \
  "vld1.8      {d3}, [%[src_v]]!             \n" \
  "vsli.u16    q0, q0, #8                    \n" /* Y * 0x101 in each lane */

// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
  "vld1.8      {d0}, [%[src_y]]!             \n" \
  "vmov.u8     q1, #128                      \n" /* neutral chroma */ \
  "vmovl.u8    q0, d0                        \n" \
  "vsli.u16    q0, q0, #8                    \n"

// Read 8 Y and 4 UV from NV12
#define READNV12 \
  "vld1.8      {d0}, [%[src_y]]!             \n" \
  "vld1.8      {d2}, [%[src_uv]]!            \n" \
  "vmov.u8     d1, d0                        \n" \
  "vmov.u8     d3, d2                        \n" \
  "vzip.u8     d0, d1                        \n" \
  "vsli.u16    d2, d2, #8                    \n" /* Duplicate low byte (U) */ \
  "vsri.u16    d3, d3, #8                    \n" /* Duplicate high byte (V) */

// Read 8 Y and 4 VU from NV21
#define READNV21 \
  "vld1.8      {d0}, [%[src_y]]!             \n" \
  "vld1.8      {d2}, [%[src_vu]]!            \n" \
  "vmov.u8     d1, d0                        \n" \
  "vmov.u8     d3, d2                        \n" \
  "vzip.u8     d0, d1                        \n" \
  "vsri.u16    d2, d2, #8                    \n" /* Duplicate high byte (U) */ \
  "vsli.u16    d3, d3, #8                    \n" /* Duplicate low byte (V) */

// Read 8 YUY2 (interleaved Y0 U Y1 V)
#define READYUY2 \
  "vld2.8      {d0, d2}, [%[src_yuy2]]!      \n" /* d0 = Y, d2 = UV bytes */ \
  "vmovl.u8    q0, d0                        \n" \
  "vmov.u8     d3, d2                        \n" \
  "vsli.u16    q0, q0, #8                    \n" \
  "vsli.u16    d2, d2, #8                    \n" \
  "vsri.u16    d3, d3, #8                    \n"

// Read 8 UYVY (interleaved U Y0 V Y1)
#define READUYVY \
  "vld2.8      {d2, d3}, [%[src_uyvy]]!      \n" /* d2 = UV bytes, d3 = Y */ \
  "vmovl.u8    q0, d3                        \n" \
  "vmov.u8     d3, d2                        \n" \
  "vsli.u16    q0, q0, #8                    \n" \
  "vsli.u16    d2, d2, #8                    \n" \
  "vsri.u16    d3, d3, #8                    \n"

// TODO: Use single register for kUVCoeff and multiply by lane
// Loads the per-colorspace constants used by YUVTORGB:
//   d26/d27/d28/d29 <- kUVCoeff bytes (each splatted across a D register),
//   d31             <- Y coefficient (kRGBCoeffBias halfword 0, splatted),
//   q10/q11/q12     <- B/G/R bias halfwords 1..3 of kRGBCoeffBias, splatted.
#define YUVTORGB_SETUP \
  "vld1.16     {d31}, [%[kRGBCoeffBias]]     \n" \
  "vld4.8      {d26[], d27[], d28[], d29[]}, [%[kUVCoeff]] \n" \
  "vdup.u16    q10, d31[1]                   \n" \
  "vdup.u16    q11, d31[2]                   \n" \
  "vdup.u16    q12, d31[3]                   \n" \
  "vdup.u16    d31, d31[0]                   \n"
100
// Output register contract of YUVTORGB:
// q0: B uint16x8_t
// q1: G uint16x8_t
// q2: R uint16x8_t

// Convert from YUV to 2.14 fixed point RGB.
// Inputs are the READ* macro outputs (Y in q0 as Y*0x101, U in d2, V in d3)
// plus the constants loaded by YUVTORGB_SETUP. Uses saturating narrowing and
// saturating subtraction so intermediate results clamp instead of wrapping.
#define YUVTORGB \
  "vmull.u16   q2, d1, d31                   \n" /* Y scaled, high half */ \
  "vmull.u8    q8, d3, d29                   \n" /* DGV */ \
  "vmull.u16   q0, d0, d31                   \n" /* Y scaled, low half */ \
  "vmlal.u8    q8, d2, d28                   \n" /* DG */ \
  "vqshrn.u32  d0, q0, #16                   \n" \
  "vqshrn.u32  d1, q2, #16                   \n" /* Y */ \
  "vmull.u8    q9, d2, d26                   \n" /* DB */ \
  "vmull.u8    q2, d3, d27                   \n" /* DR */ \
  "vadd.u16    q4, q0, q11                   \n" /* G */ \
  "vadd.u16    q2, q0, q2                    \n" /* R */ \
  "vadd.u16    q0, q0, q9                    \n" /* B */ \
  "vqsub.u16   q1, q4, q8                    \n" /* G */ \
  "vqsub.u16   q0, q0, q10                   \n" /* B */ \
  "vqsub.u16   q2, q2, q12                   \n" /* R */

// Convert from 2.14 fixed point RGB To 8 bit RGB (saturating narrow by 6).
#define RGBTORGB8 \
  "vqshrn.u16   d4, q2, #6                   \n" /* R */ \
  "vqshrn.u16   d2, q1, #6                   \n" /* G */ \
  "vqshrn.u16   d0, q0, #6                   \n" /* B */

// Clobber list shared by all YUVTORGB users. NOTE: q4 maps to d8/d9, which
// are callee-saved on ARM EABI; listing it here makes the compiler preserve
// them around the asm.
#define YUVTORGB_REGS \
  "q0", "q1", "q2", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "d31"

// Rearrange B/G/R/A from d0/d2/d4/d6 into d0..d3 and store interleaved RGBA.
#define STORERGBA \
  "vmov.u8     d1, d0                        \n" \
  "vmov.u8     d3, d4                        \n" \
  "vmov.u8     d0, d6                        \n" \
  "vst4.8      {d0, d1, d2, d3}, [%[dst_rgba]]! \n"
136
// Convert one row of I444 (planar 8-bit Y/U/V, full-resolution chroma) to
// ARGB (B,G,R,A byte order in memory). Alpha is set to 255.
// Processes 8 pixels per loop iteration; assumes width is a positive
// multiple of 8 (libyuv row convention — NOTE(review): confirm callers pad).
void I444ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"  // constant alpha
      "1:          \n" READYUV444 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
160
// Convert one row of I444 to packed 24-bit RGB24 (B,G,R byte order, no
// alpha). Processes 8 pixels per loop iteration.
void I444ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1:          \n" READYUV444 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst3.8      {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
183
// Convert one row of I422 (planar 8-bit Y/U/V, half-width chroma) to ARGB.
// Alpha is set to 255. Processes 8 pixels (4 UV pairs) per loop iteration.
void I422ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"  // constant alpha
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
207
// Convert one row of I444 plus a separate alpha plane to ARGB; the alpha
// bytes are copied through unchanged. Processes 8 pixels per iteration.
void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1:          \n" READYUV444 YUVTORGB
      RGBTORGB8
      "vld1.8      {d6}, [%[src_a]]!             \n"  // load 8 alpha bytes
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
233
// Convert one row of I422 plus a separate alpha plane to ARGB; the alpha
// bytes are copied through unchanged. Processes 8 pixels per iteration.
void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
                             const uint8_t* src_u,
                             const uint8_t* src_v,
                             const uint8_t* src_a,
                             uint8_t* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8
      "vld1.8      {d6}, [%[src_a]]!             \n"  // load 8 alpha bytes
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [src_a] "+r"(src_a),                               // %[src_a]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
259
// Convert one row of I422 to RGBA (A,B,G,R byte order in memory — alpha
// first). Alpha is set to 255. Processes 8 pixels per iteration.
void I422ToRGBARow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"  // constant alpha
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8 "subs        %[width], %[width], #8        \n" STORERGBA
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgba] "+r"(dst_rgba),                         // %[dst_rgba]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
281
// Convert one row of I422 to packed 24-bit RGB24 (no alpha). The d6 alpha
// constant is set but unused by the vst3 store. 8 pixels per iteration.
void I422ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_u,
                         const uint8_t* src_v,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst3.8      {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
305
// Pack 8-bit B/G/R (d0/d2/d4) into 8 RGB565 pixels in q2.
// vshll.u8 #8 places each channel in the high byte of a 16-bit lane; the
// vsri shift-right-inserts then merge G (5 bits down) and B (11 bits down)
// under R, yielding the 5:6:5 layout.
#define ARGBTORGB565 \
  "vshll.u8    q2, d4, #8                    \n" /* R */ \
  "vshll.u8    q1, d2, #8                    \n" /* G */ \
  "vshll.u8    q0, d0, #8                    \n" /* B */ \
  "vsri.16     q2, q1, #5                    \n" /* RG */ \
  "vsri.16     q2, q0, #11                   \n" /* RGB */
312
// Convert one row of I422 to RGB565 (16 bits per pixel). 8 pixels per
// iteration; each iteration stores 16 bytes.
void I422ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8 "subs        %[width], %[width], #8        \n" ARGBTORGB565
      "vst1.8      {q2}, [%[dst_rgb565]]!        \n"  // store 8 pixels RGB565.
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
335
// Pack 8-bit A/R/G/B (d6/d4/d2/d0) into 8 ARGB1555 pixels in q3
// (1 alpha bit, then 5 bits each of R, G, B).
#define ARGBTOARGB1555 \
  "vshll.u8    q3, d6, #8                    \n" /* A */ \
  "vshll.u8    q2, d4, #8                    \n" /* R */ \
  "vshll.u8    q1, d2, #8                    \n" /* G */ \
  "vshll.u8    q0, d0, #8                    \n" /* B */ \
  "vsri.16     q3, q2, #1                    \n" /* AR */ \
  "vsri.16     q3, q1, #6                    \n" /* ARG */ \
  "vsri.16     q3, q0, #11                   \n" /* ARGB */
344
// Convert one row of I422 to ARGB1555 (alpha bit forced to 1 via d6 = 0xff).
// 8 pixels per iteration; each iteration stores 16 bytes.
void I422ToARGB1555Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vmov.u8     d6, #0xff                     \n" ARGBTOARGB1555
      "vst1.8      {q3}, [%[dst_argb1555]]!      \n"  // store 8 pixels ARGB1555.
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb1555] "+r"(dst_argb1555),                 // %[dst_argb1555]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "q3");
}
368
// Pack 8-bit A/R/G/B (d6/d4/d2/d0) into 8 ARGB4444 pixels in q0.
// Requires d7 preloaded with 0x0f in every byte (vbic mask that clears the
// low nibble of G and A). B and R keep their high nibble via vshr #4;
// vorr then pairs BG and RA, and vzip interleaves into the final layout.
#define ARGBTOARGB4444 \
  "vshr.u8    d0, d0, #4                     \n" /* B */ \
  "vbic.32    d2, d2, d7                     \n" /* G */ \
  "vshr.u8    d4, d4, #4                     \n" /* R */ \
  "vbic.32    d6, d6, d7                     \n" /* A */ \
  "vorr       d0, d0, d2                     \n" /* BG */ \
  "vorr       d1, d4, d6                     \n" /* RA */ \
  "vzip.u8    d0, d1                         \n" /* BGRA */
377
// Convert one row of I422 to ARGB4444 (4 bits per channel). Alpha is 255
// (0xf after packing). 8 pixels per iteration; stores 16 bytes each pass.
void I422ToARGB4444Row_NEON(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            uint8_t* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "vmov.u8     d7, #0x0f                     \n"  // vbic bits to clear
      "1:          \n" READYUV422 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n" ARGBTOARGB4444
      "vst1.8      {q0}, [%[dst_argb4444]]!      \n"  // store 8 pixels
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_u] "+r"(src_u),                               // %[src_u]
        [src_v] "+r"(src_v),                               // %[src_v]
        [dst_argb4444] "+r"(dst_argb4444),                 // %[dst_argb4444]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "q3");
}
402
// Convert one row of I400 (Y only) to ARGB, applying the full YUV matrix
// with chroma fixed at 128 (see READYUV400). Alpha is 255. 8 pixels per
// iteration.
void I400ToARGBRow_NEON(const uint8_t* src_y,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READYUV400 YUVTORGB
      RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
422
// Convert one row of J400 (full-range grey) to ARGB by replicating each Y
// byte into B, G and R and setting A to 255 — no matrix math.
// 8 pixels per iteration.
void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8    d23, #255                      \n"  // constant alpha
      "1:          \n"
      "vld1.8     {d20}, [%0]!                   \n"  // load 8 Y
      "vmov       d21, d20                       \n"  // G = Y
      "vmov       d22, d20                       \n"  // R = Y
      "subs       %2, %2, #8                     \n"
      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 ARGB
      "bgt        1b                             \n"
      : "+r"(src_y),    // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d20", "d21", "d22", "d23");
}
439
// Convert one row of NV12 (Y plane + interleaved UV plane) to ARGB.
// Alpha is 255. 8 pixels (4 UV pairs) per iteration.
void NV12ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READNV12 YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
460
// Convert one row of NV21 (Y plane + interleaved VU plane) to ARGB.
// Alpha is 255. 8 pixels (4 VU pairs) per iteration.
void NV21ToARGBRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READNV21 YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_vu] "+r"(src_vu),                             // %[src_vu]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
481
// Convert one row of NV12 to packed 24-bit RGB24. The d6 alpha constant is
// set but unused by the vst3 store. 8 pixels per iteration.
void NV12ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_uv,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READNV12 YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst3.8      {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
502
// Convert one row of NV21 to packed 24-bit RGB24. The d6 alpha constant is
// set but unused by the vst3 store. 8 pixels per iteration.
void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                         const uint8_t* src_vu,
                         uint8_t* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READNV21 YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst3.8      {d0, d2, d4}, [%[dst_rgb24]]! \n"
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_vu] "+r"(src_vu),                             // %[src_vu]
        [dst_rgb24] "+r"(dst_rgb24),                       // %[dst_rgb24]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
523
// Convert one row of NV12 to RGB565. 8 pixels per iteration; each iteration
// stores 16 bytes.
void NV12ToRGB565Row_NEON(const uint8_t* src_y,
                          const uint8_t* src_uv,
                          uint8_t* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READNV12 YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n" ARGBTORGB565
      "vst1.8      {q2}, [%[dst_rgb565]]!        \n"  // store 8 pixels RGB565.
      "bgt         1b                            \n"
      : [src_y] "+r"(src_y),                               // %[src_y]
        [src_uv] "+r"(src_uv),                             // %[src_uv]
        [dst_rgb565] "+r"(dst_rgb565),                     // %[dst_rgb565]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS);
}
544
// Convert one row of packed YUY2 (Y0 U Y1 V) to ARGB. Alpha is 255.
// 8 pixels per iteration (16 source bytes).
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READYUY2 YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_yuy2] "+r"(src_yuy2),                         // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
563
// Convert one row of packed UYVY (U Y0 V Y1) to ARGB. Alpha is 255.
// 8 pixels per iteration (16 source bytes).
void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        uint8_t* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
      "vmov.u8     d6, #255                      \n"
      "1:          \n" READUYVY YUVTORGB RGBTORGB8
      "subs        %[width], %[width], #8        \n"
      "vst4.8      {d0, d2, d4, d6}, [%[dst_argb]]! \n"
      "bgt         1b                            \n"
      : [src_uyvy] "+r"(src_uyvy),                         // %[src_uyvy]
        [dst_argb] "+r"(dst_argb),                         // %[dst_argb]
        [width] "+r"(width)                                // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),           // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias)  // %[kRGBCoeffBias]
      : "cc", "memory", YUVTORGB_REGS, "d6");
}
582
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
// Processes 16 pairs (32 bytes) per iteration; vld2 performs the
// deinterleave.
void SplitUVRow_NEON(const uint8_t* src_uv,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile(
      "1:          \n"
      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
      "subs       %3, %3, #16                    \n"  // 16 processed per loop
      "vst1.8     {q0}, [%1]!                    \n"  // store U
      "vst1.8     {q1}, [%2]!                    \n"  // store V
      "bgt        1b                             \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
603
// Reads 16 byte Y's from tile and writes out 16 Y's.
// MM21 Y tiles are 16x32 so src_tile_stride = 512 bytes
// MM21 UV tiles are 8x16 so src_tile_stride = 256 bytes
// width measured in bytes so 8 UV = 16.
// Copies one de-tiled row: each 16-byte load advances src by the tile
// stride; the pld prefetches ahead of the strided reads.
void DetileRow_NEON(const uint8_t* src,
                    ptrdiff_t src_tile_stride,
                    uint8_t* dst,
                    int width) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q0}, [%0], %3                 \n"  // load 16 bytes
      "subs       %2, %2, #16                    \n"  // 16 processed per loop
      "pld        [%0, #1792]                    \n"  // prefetch next tiles
      "vst1.8     {q0}, [%1]!                    \n"  // store 16 bytes
      "bgt        1b                             \n"
      : "+r"(src),            // %0
        "+r"(dst),            // %1
        "+r"(width)           // %2
      : "r"(src_tile_stride)  // %3
      : "cc", "memory", "q0"  // Clobber List
  );
}
626
// Reads 16 byte Y's of 16 bits from tile and writes out 16 Y's.
// Same as DetileRow_NEON but for 16-bit pixels, so the byte stride passed
// to the asm is src_tile_stride * 2.
void DetileRow_16_NEON(const uint16_t* src,
                       ptrdiff_t src_tile_stride,
                       uint16_t* dst,
                       int width) {
  asm volatile(
      "1:          \n"
      "vld1.16    {q0, q1}, [%0], %3             \n"  // load 16 pixels
      "subs       %2, %2, #16                    \n"  // 16 processed per loop
      "pld        [%0, #3584]                    \n"
      "vst1.16    {q0, q1}, [%1]!                \n"  // store 16 pixels
      "bgt        1b                             \n"
      : "+r"(src),                  // %0
        "+r"(dst),                  // %1
        "+r"(width)                 // %2
      : "r"(src_tile_stride * 2)    // %3  // stride in bytes
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
646
// Read 16 bytes of UV, detile, and write 8 bytes of U and 8 bytes of V.
// vld2 deinterleaves the 16-byte tile load into U (d0) and V (d1); src
// advances by the tile stride after each load.
void DetileSplitUVRow_NEON(const uint8_t* src_uv,
                           ptrdiff_t src_tile_stride,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      "1:          \n"
      "vld2.8     {d0, d1}, [%0], %4             \n"
      "subs       %3, %3, #16                    \n"  // width counts UV bytes
      "pld        [%0, #1792]                    \n"
      "vst1.8     {d0}, [%1]!                    \n"  // store 8 U
      "vst1.8     {d1}, [%2]!                    \n"  // store 8 V
      "bgt        1b                             \n"
      : "+r"(src_uv),               // %0
        "+r"(dst_u),                // %1
        "+r"(dst_v),                // %2
        "+r"(width)                 // %3
      : "r"(src_tile_stride)        // %4
      : "cc", "memory", "d0", "d1"  // Clobber List
  );
}
669
#if LIBYUV_USE_ST2
// Read 16 Y, 8 UV, and write 8 YUYV.
// ST2 variant: relies on vst2 to interleave Y (q0) and UV (q1) on store.
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q0}, [%0], %4                 \n"  // Load 16 Y
      "pld        [%0, #1792]                    \n"
      "vld1.8     {q1}, [%1], %5                 \n"  // Load 8 UV
      "pld        [%1, #1792]                    \n"
      "subs       %3, %3, #16                    \n"
      "vst2.8     {q0, q1}, [%2]!                \n"  // interleaved store
      "bgt        1b                             \n"
      : "+r"(src_y),                            // %0
        "+r"(src_uv),                           // %1
        "+r"(dst_yuy2),                         // %2
        "+r"(width)                             // %3
      : "r"(src_y_tile_stride),                 // %4
        "r"(src_uv_tile_stride)                 // %5
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber list
  );
}
#else
// Read 16 Y, 8 UV, and write 8 YUYV.
// Fallback variant: interleaves in registers with vzip, then does a plain
// vst1 of the two interleaved Q registers.
void DetileToYUY2_NEON(const uint8_t* src_y,
                       ptrdiff_t src_y_tile_stride,
                       const uint8_t* src_uv,
                       ptrdiff_t src_uv_tile_stride,
                       uint8_t* dst_yuy2,
                       int width) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q0}, [%0], %4                 \n"  // Load 16 Y
      "vld1.8     {q1}, [%1], %5                 \n"  // Load 8 UV
      "subs       %3, %3, #16                    \n"
      "pld        [%0, #1792]                    \n"
      "vzip.8     q0, q1                         \n"  // interleave Y and UV
      "pld        [%1, #1792]                    \n"
      "vst1.8     {q0, q1}, [%2]!                \n"
      "bgt        1b                             \n"
      : "+r"(src_y),                            // %0
        "+r"(src_uv),                           // %1
        "+r"(dst_yuy2),                         // %2
        "+r"(width)                             // %3
      : "r"(src_y_tile_stride),                 // %4
        "r"(src_uv_tile_stride)                 // %5
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber list
  );
}
#endif
724
// Unpack one 80-byte MT2T block (64 10-bit pixels: 16 bytes of packed low
// 2-bit fragments followed by 64 bytes of upper 8 bits) into 64 uint16_t
// values. size counts packed source bytes and decrements by 80 per
// iteration. The low bits are positioned by shifting the packed byte left
// by 6/4/2/0, zipped under the upper bits, then vsri replicates the top 6
// bits into the bottom for better 10->16 bit scaling.
void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q14}, [%0]!                   \n"  // Load lower bits.
      "vld1.8     {q9}, [%0]!                    \n"  // Load upper bits row
                                                      // by row.
      "vld1.8     {q11}, [%0]!                   \n"
      "vld1.8     {q13}, [%0]!                   \n"
      "vld1.8     {q15}, [%0]!                   \n"
      "vshl.u8    q8, q14, #6                    \n"  // Shift lower bit data
                                                      // appropriately.
      "vshl.u8    q10, q14, #4                   \n"
      "vshl.u8    q12, q14, #2                   \n"
      "vzip.u8    q8, q9                         \n"  // Interleave upper and
                                                      // lower bits.
      "vzip.u8    q10, q11                       \n"
      "vzip.u8    q12, q13                       \n"
      "vzip.u8    q14, q15                       \n"
      "vsri.u16   q8, q8, #10                    \n"  // Copy upper 6 bits
                                                      // into lower 6 bits for
                                                      // better accuracy in
                                                      // conversions.
      "vsri.u16   q9, q9, #10                    \n"
      "vsri.u16   q10, q10, #10                  \n"
      "vsri.u16   q11, q11, #10                  \n"
      "vsri.u16   q12, q12, #10                  \n"
      "vsri.u16   q13, q13, #10                  \n"
      "vsri.u16   q14, q14, #10                  \n"
      "vsri.u16   q15, q15, #10                  \n"
      "vstmia     %1!, {q8-q15}                  \n"  // Store pixel block (64
                                                      // pixels).
      "subs       %2, %2, #80                    \n"  // 80 src bytes consumed
      "bgt        1b                             \n"
      : "+r"(src),  // %0
        "+r"(dst),  // %1
        "+r"(size)  // %2
      :
      : "cc", "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
764
// Reads 16 U's and V's and writes out 16 pairs of UV.
// vst2 performs the interleave; 16 pairs (32 output bytes) per iteration.
void MergeUVRow_NEON(const uint8_t* src_u,
                     const uint8_t* src_v,
                     uint8_t* dst_uv,
                     int width) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q0}, [%0]!                    \n"  // load U
      "vld1.8     {q1}, [%1]!                    \n"  // load V
      "subs       %3, %3, #16                    \n"  // 16 processed per loop
      "vst2.8     {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
      "bgt        1b                             \n"
      : "+r"(src_u),                // %0
        "+r"(src_v),                // %1
        "+r"(dst_uv),               // %2
        "+r"(width)                 // %3  // Output registers
      :                             // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
785
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
// Two vld3 loads deinterleave 8 pixels each into q0/q1/q2; note the planes
// are stored in load order (first channel -> dst_r).
void SplitRGBRow_NEON(const uint8_t* src_rgb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  asm volatile(
      "1:          \n"
      "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB
      "vld3.8     {d1, d3, d5}, [%0]!            \n"  // next 8 RGB
      "subs       %4, %4, #16                    \n"  // 16 processed per loop
      "vst1.8     {q0}, [%1]!                    \n"  // store R
      "vst1.8     {q1}, [%2]!                    \n"  // store G
      "vst1.8     {q2}, [%3]!                    \n"  // store B
      "bgt        1b                             \n"
      : "+r"(src_rgb),                    // %0
        "+r"(dst_r),                      // %1
        "+r"(dst_g),                      // %2
        "+r"(dst_b),                      // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
810
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
// Inverse of SplitRGBRow_NEON: two vst3 stores interleave 8 pixels each.
void MergeRGBRow_NEON(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_rgb,
                      int width) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q0}, [%0]!                    \n"  // load R
      "vld1.8     {q1}, [%1]!                    \n"  // load G
      "vld1.8     {q2}, [%2]!                    \n"  // load B
      "subs       %4, %4, #16                    \n"  // 16 processed per loop
      "vst3.8     {d0, d2, d4}, [%3]!            \n"  // store 8 RGB
      "vst3.8     {d1, d3, d5}, [%3]!            \n"  // next 8 RGB
      "bgt        1b                             \n"
      : "+r"(src_r),                      // %0
        "+r"(src_g),                      // %1
        "+r"(src_b),                      // %2
        "+r"(dst_rgb),                    // %3
        "+r"(width)                       // %4
      :                                   // Input registers
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
835
// Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b, dst_a.
// The vld4 deinterleaves memory order B,G,R,A into q0..q3, hence q0 goes to
// dst_b (%3) and q2 to dst_r (%1).
void SplitARGBRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       uint8_t* dst_a,
                       int width) {
  asm volatile(
      "1:          \n"
      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB
      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // next 8 ARGB
      "subs       %5, %5, #16                    \n"  // 16 processed per loop
      "vst1.8     {q0}, [%3]!                    \n"  // store B
      "vst1.8     {q1}, [%2]!                    \n"  // store G
      "vst1.8     {q2}, [%1]!                    \n"  // store R
      "vst1.8     {q3}, [%4]!                    \n"  // store A
      "bgt        1b                             \n"
      : "+r"(src_argb),                         // %0
        "+r"(dst_r),                            // %1
        "+r"(dst_g),                            // %2
        "+r"(dst_b),                            // %3
        "+r"(dst_a),                            // %4
        "+r"(width)                             // %5
      :                                         // Input registers
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
863
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time.
// Channels are loaded into B,G,R,A register order (q0..q3) so the vst4
// interleave produces memory order B,G,R,A.
void MergeARGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       const uint8_t* src_a,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "1:          \n"
      "vld1.8     {q2}, [%0]!                    \n"  // load R
      "vld1.8     {q1}, [%1]!                    \n"  // load G
      "vld1.8     {q0}, [%2]!                    \n"  // load B
      "vld1.8     {q3}, [%3]!                    \n"  // load A
      "subs       %5, %5, #16                    \n"  // 16 processed per loop
      "vst4.8     {d0, d2, d4, d6}, [%4]!        \n"  // store 8 ARGB
      "vst4.8     {d1, d3, d5, d7}, [%4]!        \n"  // next 8 ARGB
      "bgt        1b                             \n"
      : "+r"(src_r),                            // %0
        "+r"(src_g),                            // %1
        "+r"(src_b),                            // %2
        "+r"(src_a),                            // %3
        "+r"(dst_argb),                         // %4
        "+r"(width)                             // %5
      :                                         // Input registers
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
891
892 // Reads 16 packed ARGB and write to planar dst_r, dst_g, dst_b.
void SplitXRGBRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,
                       uint8_t* dst_b,
                       int width) {
  // Same as SplitARGBRow_NEON but the deinterleaved alpha plane (q3) is
  // loaded and then discarded. 16 pixels per iteration.
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // next 8 ARGB
      "subs        %4, %4, #16                   \n"  // 16 processed per loop
      "vst1.8      {q0}, [%3]!                   \n"  // store B
      "vst1.8      {q1}, [%2]!                   \n"  // store G
      "vst1.8      {q2}, [%1]!                   \n"  // store R
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),     // %1
        "+r"(dst_g),     // %2
        "+r"(dst_b),     // %3
        "+r"(width)      // %4
      :  // Input registers
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
916
// Reads 16 planar R's, G's and B's and writes out 16 packed ARGB at a time,
// with alpha forced to 255.
void MergeXRGBRow_NEON(const uint8_t* src_r,
                       const uint8_t* src_g,
                       const uint8_t* src_b,
                       uint8_t* dst_argb,
                       int width) {
  // q3 is filled with 0xff once before the loop so every output pixel gets
  // an opaque alpha byte. 16 pixels per iteration.
  asm volatile(
      "vmov.u8     q3, #255                      \n"  // load A(255)
      "1:                                        \n"
      "vld1.8      {q2}, [%0]!                   \n"  // load R
      "vld1.8      {q1}, [%1]!                   \n"  // load G
      "vld1.8      {q0}, [%2]!                   \n"  // load B
      "subs        %4, %4, #16                   \n"  // 16 processed per loop
      "vst4.8      {d0, d2, d4, d6}, [%3]!       \n"  // store 8 ARGB
      "vst4.8      {d1, d3, d5, d7}, [%3]!       \n"  // next 8 ARGB
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      :  // Input registers
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
942
// Merges 16-bit planar R, G and B (of 'depth' significant bits) into packed
// AR30 (2-bit alpha = 3, 10 bits per color channel). 4 pixels per iteration.
void MergeXR30Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint8_t* dst_ar30,
                       int depth,
                       int width) {
  int shift = 10 - depth;  // scale 'depth'-bit input up to 10 bits
  asm volatile(
      "vmov.u32    q14, #1023                    \n"  // 10-bit channel max
      "vdup.32     q15, %5                       \n"  // per-lane shift amount
      "1:                                        \n"
      "vld1.16     {d4}, [%2]!                   \n"  // B
      "vld1.16     {d2}, [%1]!                   \n"  // G
      "vld1.16     {d0}, [%0]!                   \n"  // R
      "vmovl.u16   q2, d4                        \n"  // B widen to 32 bits
      "vmovl.u16   q1, d2                        \n"  // G
      "vmovl.u16   q0, d0                        \n"  // R
      "vshl.u32    q2, q2, q15                   \n"  // 000B
      "vshl.u32    q1, q1, q15                   \n"
      "vshl.u32    q0, q0, q15                   \n"
      "vmin.u32    q2, q2, q14                   \n"  // clamp to 10 bits
      "vmin.u32    q1, q1, q14                   \n"
      "vmin.u32    q0, q0, q14                   \n"
      "vsli.u32    q2, q1, #10                   \n"  // 00GB
      "vsli.u32    q2, q0, #20                   \n"  // 0RGB
      "vorr.u32    q2, #0xc0000000               \n"  // ARGB (AR30)
      "subs        %4, %4, #4                    \n"
      "vst1.8      {q2}, [%3]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory", "cc", "q0", "q1", "q2", "q14", "q15");
}
980
// Specialization of MergeXR30Row_NEON for depth == 10: no shift is needed,
// only the clamp to 10 bits. 4 pixels per iteration. The depth parameter is
// ignored but kept so the signature matches MergeXR30Row_NEON.
void MergeXR30Row_10_NEON(const uint16_t* src_r,
                          const uint16_t* src_g,
                          const uint16_t* src_b,
                          uint8_t* dst_ar30,
                          int /* depth */,
                          int width) {
  asm volatile(
      "vmov.u32    q14, #1023                    \n"  // 10-bit channel max
      "1:                                        \n"
      "vld1.16     {d4}, [%2]!                   \n"  // B
      "vld1.16     {d2}, [%1]!                   \n"  // G
      "vld1.16     {d0}, [%0]!                   \n"  // R
      "vmovl.u16   q2, d4                        \n"  // 000B
      "vmovl.u16   q1, d2                        \n"  // G
      "vmovl.u16   q0, d0                        \n"  // R
      "vmin.u32    q2, q2, q14                   \n"  // clamp to 10 bits
      "vmin.u32    q1, q1, q14                   \n"
      "vmin.u32    q0, q0, q14                   \n"
      "vsli.u32    q2, q1, #10                   \n"  // 00GB
      "vsli.u32    q2, q0, #20                   \n"  // 0RGB
      "vorr.u32    q2, #0xc0000000               \n"  // ARGB (AR30)
      "subs        %4, %4, #4                    \n"
      "vst1.8      {q2}, [%3]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar30),  // %3
        "+r"(width)      // %4
      :
      : "memory", "cc", "q0", "q1", "q2", "q14");
}
1014
// Merges 16-bit planar R, G, B and A (of 'depth' significant bits) into
// packed 64-bit AR64 (16 bits per channel). Each channel is clamped to the
// 'depth'-bit range then scaled up to 16 bits. 8 pixels per iteration.
void MergeAR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       const uint16_t* src_a,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;        // scale 'depth'-bit input up to 16 bits
  int mask = (1 << depth) - 1;   // clamp limit for 'depth'-bit input
  asm volatile(

      "vdup.u16    q15, %6                       \n"  // shift
      "vdup.u16    q14, %7                       \n"  // mask
      "1:                                        \n"
      "vld1.16     {q2}, [%0]!                   \n"  // R
      "vld1.16     {q1}, [%1]!                   \n"  // G
      "vld1.16     {q0}, [%2]!                   \n"  // B
      "vld1.16     {q3}, [%3]!                   \n"  // A
      "vmin.u16    q2, q2, q14                   \n"
      "vmin.u16    q1, q1, q14                   \n"
      "vmin.u16    q0, q0, q14                   \n"
      "vmin.u16    q3, q3, q14                   \n"
      "vshl.u16    q2, q2, q15                   \n"
      "vshl.u16    q1, q1, q15                   \n"
      "vshl.u16    q0, q0, q15                   \n"
      "vshl.u16    q3, q3, q15                   \n"
      "subs        %5, %5, #8                    \n"
      "vst4.16     {d0, d2, d4, d6}, [%4]!       \n"
      "vst4.16     {d1, d3, d5, d7}, [%4]!       \n"
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_ar64),  // %4
        "+r"(width)      // %5
      : "r"(shift),      // %6
        "r"(mask)        // %7
      // q14 was missing from the clobber list even though it is written by
      // the vdup above; the compiler must be told so it does not assume the
      // register is preserved.
      : "memory", "cc", "q0", "q1", "q2", "q3", "q14", "q15");
}
1055
// Same as MergeAR64Row_NEON but without a source alpha plane: alpha is
// forced to 0xffff. 8 pixels per iteration.
void MergeXR64Row_NEON(const uint16_t* src_r,
                       const uint16_t* src_g,
                       const uint16_t* src_b,
                       uint16_t* dst_ar64,
                       int depth,
                       int width) {
  int shift = 16 - depth;        // scale 'depth'-bit input up to 16 bits
  int mask = (1 << depth) - 1;   // clamp limit for 'depth'-bit input
  asm volatile(

      "vmov.u8     q3, #0xff                     \n"  // A (0xffff)
      "vdup.u16    q15, %5                       \n"  // shift
      "vdup.u16    q14, %6                       \n"  // mask
      "1:                                        \n"
      "vld1.16     {q2}, [%0]!                   \n"  // R
      "vld1.16     {q1}, [%1]!                   \n"  // G
      "vld1.16     {q0}, [%2]!                   \n"  // B
      "vmin.u16    q2, q2, q14                   \n"
      "vmin.u16    q1, q1, q14                   \n"
      "vmin.u16    q0, q0, q14                   \n"
      "vshl.u16    q2, q2, q15                   \n"
      "vshl.u16    q1, q1, q15                   \n"
      "vshl.u16    q0, q0, q15                   \n"
      "subs        %4, %4, #8                    \n"
      "vst4.16     {d0, d2, d4, d6}, [%3]!       \n"
      "vst4.16     {d1, d3, d5, d7}, [%3]!       \n"
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_ar64),  // %3
        "+r"(width)      // %4
      : "r"(shift),      // %5
        "r"(mask)        // %6
      // q14 was missing from the clobber list even though the vdup above
      // writes it; without it the compiler may assume q14 is preserved.
      : "memory", "cc", "q0", "q1", "q2", "q3", "q14", "q15");
}
1092
// Merges 16-bit planar R, G, B and A (of 'depth' significant bits) into
// packed 8-bit ARGB. shift = 8 - depth is negative for depth > 8; NEON
// vshl with a register shift count shifts right for negative values, so
// this scales the channels down to 8 bits. vqmovn then saturates to u8.
// 8 pixels per iteration.
void MergeARGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            const uint16_t* src_a,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "vdup.16     q15, %6                       \n"  // shift (may be < 0)
      "1:                                        \n"
      "vld1.16     {q2}, [%0]!                   \n"  // R
      "vld1.16     {q1}, [%1]!                   \n"  // G
      "vld1.16     {q0}, [%2]!                   \n"  // B
      "vld1.16     {q3}, [%3]!                   \n"  // A
      "vshl.u16    q2, q2, q15                   \n"
      "vshl.u16    q1, q1, q15                   \n"
      "vshl.u16    q0, q0, q15                   \n"
      "vshl.u16    q3, q3, q15                   \n"
      "vqmovn.u16  d0, q0                        \n"  // B, saturating narrow
      "vqmovn.u16  d1, q1                        \n"  // G
      "vqmovn.u16  d2, q2                        \n"  // R
      "vqmovn.u16  d3, q3                        \n"  // A
      "subs        %5, %5, #8                    \n"
      "vst4.8      {d0, d1, d2, d3}, [%4]!       \n"  // interleave B,G,R,A
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(src_a),     // %3
        "+r"(dst_argb),  // %4
        "+r"(width)      // %5
      : "r"(shift)       // %6
      : "memory", "cc", "q0", "q1", "q2", "q3", "q15");
}
1129
// Same as MergeARGB16To8Row_NEON but with no source alpha plane: d6 is
// preloaded with 0xff so every output pixel is opaque. 8 pixels per
// iteration. As above, a negative shift in q15 acts as a right shift.
void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
                            const uint16_t* src_g,
                            const uint16_t* src_b,
                            uint8_t* dst_argb,
                            int depth,
                            int width) {
  int shift = 8 - depth;
  asm volatile(

      "vdup.16     q15, %5                       \n"  // shift (may be < 0)
      "vmov.u8     d6, #0xff                     \n"  // A (0xff)
      "1:                                        \n"
      "vld1.16     {q2}, [%0]!                   \n"  // R
      "vld1.16     {q1}, [%1]!                   \n"  // G
      "vld1.16     {q0}, [%2]!                   \n"  // B
      "vshl.u16    q2, q2, q15                   \n"
      "vshl.u16    q1, q1, q15                   \n"
      "vshl.u16    q0, q0, q15                   \n"
      "vqmovn.u16  d5, q2                        \n"  // R, saturating narrow
      "vqmovn.u16  d4, q1                        \n"  // G
      "vqmovn.u16  d3, q0                        \n"  // B
      "subs        %4, %4, #8                    \n"
      "vst4.u8     {d3, d4, d5, d6}, [%3]!       \n"  // interleave B,G,R,A
      "bgt         1b                            \n"
      : "+r"(src_r),     // %0
        "+r"(src_g),     // %1
        "+r"(src_b),     // %2
        "+r"(dst_argb),  // %3
        "+r"(width)      // %4
      : "r"(shift)       // %5
      : "memory", "cc", "q0", "q1", "q2", "d6", "q15");
}
1162
// Copy multiple of 32 bytes per loop. vld1.8 allows unaligned access and is
// fastest on a15.
void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  // Straight 32-byte-at-a-time memcpy of a row.
  asm volatile(
      "1:                                        \n"
      "vld1.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 32
      "subs        %2, %2, #32                   \n"  // 32 processed per loop
      "vst1.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 32
      "bgt         1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2  // Output registers
      :              // Input registers
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
1178
1179 // SetRow writes 'width' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
  // memset-style fill: v8 is broadcast to all 16 lanes of q0 once, then
  // stored 16 bytes per iteration.
  asm volatile(
      "vdup.8      q0, %2                        \n"  // duplicate 16 bytes
      "1:                                        \n"
      "subs        %1, %1, #16                   \n"  // 16 bytes per loop
      "vst1.8      {q0}, [%0]!                   \n"  // store
      "bgt         1b                            \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v8)      // %2
      : "cc", "memory", "q0");
}
1192
// ARGBSetRow writes 'width' pixels using a 32 bit value repeated.
void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
  // 32-bit fill: v32 (one ARGB pixel) is broadcast to the 4 lanes of q0,
  // then stored 4 pixels (16 bytes) per iteration.
  asm volatile(
      "vdup.u32    q0, %2                        \n"  // duplicate 4 ints
      "1:                                        \n"
      "subs        %1, %1, #4                    \n"  // 4 pixels per loop
      "vst1.8      {q0}, [%0]!                   \n"  // store
      "bgt         1b                            \n"
      : "+r"(dst),   // %0
        "+r"(width)  // %1
      : "r"(v32)     // %2
      : "cc", "memory", "q0");
}
1206
// Reverses a row of bytes. Reads 32 bytes at a time from the end of the row
// (post-index of -32 walks src backwards) and byte-reverses each 32-byte
// group with vrev64 + vswp before storing forwards.
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      // Start at end of source row.
      "add         %0, %0, %2                    \n"
      "sub         %0, %0, #32                   \n"  // 32 bytes per loop

      "1:                                        \n"
      "vld1.8      {q1, q2}, [%0], %3            \n"  // src -= 32
      "subs        %2, #32                       \n"  // 32 pixels per loop.
      "vrev64.8    q0, q2                        \n"
      "vrev64.8    q1, q1                        \n"
      "vswp        d0, d1                        \n"
      "vswp        d2, d3                        \n"
      "vst1.8      {q0, q1}, [%1]!               \n"  // dst += 32
      "bgt         1b                            \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      : "r"(-32)     // %3
      : "cc", "memory", "q0", "q1", "q2");
}
1228
// Reverses a row of UV pairs, keeping each U,V pair in order. vld2
// deinterleaves U into d0 and V into d1 so reversing each plane and
// re-interleaving with vst2 mirrors pairs rather than bytes.
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  asm volatile(
      // Start at end of source row.
      "mov         r12, #-16                     \n"
      "add         %0, %0, %2, lsl #1            \n"  // width is in pairs
      "sub         %0, #16                       \n"

      "1:                                        \n"
      "vld2.8      {d0, d1}, [%0], r12           \n"  // src -= 16
      "subs        %2, #8                        \n"  // 8 pixels per loop.
      "vrev64.8    q0, q0                        \n"
      "vst2.8      {d0, d1}, [%1]!               \n"  // dst += 16
      "bgt         1b                            \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_uv),  // %1
        "+r"(width)    // %2
      :
      : "cc", "memory", "r12", "q0");
}
1248
// Reverses a row of UV pairs while also splitting them into separate U and
// V planes. Same backward-walking load as MirrorUVRow_NEON, but the two
// reversed halves are stored to dst_u and dst_v instead of re-interleaved.
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
                           int width) {
  asm volatile(
      // Start at end of source row.
      "mov         r12, #-16                     \n"
      "add         %0, %0, %3, lsl #1            \n"  // width is in pairs
      "sub         %0, #16                       \n"

      "1:                                        \n"
      "vld2.8      {d0, d1}, [%0], r12           \n"  // src -= 16
      "subs        %3, #8                        \n"  // 8 pixels per loop.
      "vrev64.8    q0, q0                        \n"
      "vst1.8      {d0}, [%1]!                   \n"  // dst += 8
      "vst1.8      {d1}, [%2]!                   \n"
      "bgt         1b                            \n"
      : "+r"(src_uv),  // %0
        "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
        "+r"(width)    // %3
      :
      : "cc", "memory", "r12", "q0");
}
1273
// Reverses a row of ARGB pixels. vld4 splits channels into d0..d3, each
// channel is byte-reversed independently, then vst4 re-interleaves, so
// 4-byte pixels stay intact while their order is reversed.
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "add         %0, %0, %2, lsl #2            \n"  // width is in pixels
      "sub         %0, #32                       \n"

      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0], %3    \n"  // src -= 32
      "subs        %2, #8                        \n"  // 8 pixels per loop.
      "vrev64.8    d0, d0                        \n"
      "vrev64.8    d1, d1                        \n"
      "vrev64.8    d2, d2                        \n"
      "vrev64.8    d3, d3                        \n"
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // dst += 32
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(-32)         // %3
      : "cc", "memory", "d0", "d1", "d2", "d3");
}
1294
// Reverses a row of RGB24 pixels. The source pointer is pre-positioned in C
// at the last 8-pixel group; the loop then walks it backwards 24 bytes at a
// time, reversing each deinterleaved channel so 3-byte pixels stay intact.
void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_rgb24,
                         int width) {
  src_rgb24 += width * 3 - 24;
  asm volatile(
      "1:                                        \n"
      "vld3.8      {d0, d1, d2}, [%0], %3        \n"  // src -= 24
      "subs        %2, #8                        \n"  // 8 pixels per loop.
      "vrev64.8    d0, d0                        \n"
      "vrev64.8    d1, d1                        \n"
      "vrev64.8    d2, d2                        \n"
      "vst3.8      {d0, d1, d2}, [%1]!           \n"  // dst += 24
      "bgt         1b                            \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      : "r"(-24)          // %3
      : "cc", "memory", "d0", "d1", "d2");
}
1314
// Converts RGB24 to ARGB by appending an opaque alpha byte: d4 is preloaded
// with 255 and stored as the 4th interleaved channel. 8 pixels per loop.
void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
                         uint8_t* dst_argb,
                         int width) {
  asm volatile(
      "vmov.u8     d4, #255                      \n"  // Alpha
      "1:                                        \n"
      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RGB24.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vst4.8      {d1, d2, d3, d4}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_rgb24),  // %0
        "+r"(dst_argb),   // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
1332
// Converts RAW (R,G,B byte order) to ARGB: swaps the R and B channels after
// the deinterleaving load and appends alpha = 255. 8 pixels per loop.
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8     d4, #255                      \n"  // Alpha
      "1:                                        \n"
      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RAW.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vswp.u8     d1, d3                        \n"  // swap R, B
      "vst4.8      {d1, d2, d3, d4}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
1349
// Converts RAW to RGBA: like RAWToARGBRow_NEON but alpha (d0) is stored as
// the FIRST interleaved byte instead of the last. 8 pixels per loop.
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  asm volatile(
      "vmov.u8     d0, #255                      \n"  // Alpha
      "1:                                        \n"
      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RAW.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vswp.u8     d1, d3                        \n"  // swap R, B
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of RGBA.
      "bgt         1b                            \n"
      : "+r"(src_raw),   // %0
        "+r"(dst_rgba),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
// Converts RAW to RGB24 by swapping the R and B channels in place.
// 8 pixels per loop.
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  asm volatile(
      "1:                                        \n"
      "vld3.8      {d1, d2, d3}, [%0]!           \n"  // load 8 pixels of RAW.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vswp.u8     d1, d3                        \n"  // swap R, B
      "vst3.8      {d1, d2, d3}, [%1]!           \n"  // store 8 pixels of
                                                      // RGB24.
      "bgt         1b                            \n"
      : "+r"(src_raw),    // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
  );
}
1382
// Unpacks 8 RGB565 pixels in q0 to 8-bit B (d0), G (d1) and R (d2).
// Each channel's top bits are replicated into its low bits (shift + orr)
// so 5/6-bit values expand to the full 0..255 range.
#define RGB565TOARGB                                                        \
  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG        */ \
  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
  "vorr.u8    d1, d4, d6                     \n" /* G                    */
1394
// Converts RGB565 to ARGB using the RGB565TOARGB unpack macro; d3 holds the
// constant alpha = 255 (the macro does not touch d3). 8 pixels per loop.
void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      "vmov.u8     d3, #255                      \n"  // Alpha
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 8 RGB565 pixels.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      RGB565TOARGB
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
1413
// Unpacks 8 ARGB1555 pixels in q0 to 8-bit B (d0), G (d1), R (d2), A (d3).
// Color channels replicate their top bits into the low bits; the 1-bit
// alpha becomes 0x00 or 0xff via vshr #7 then vneg.s8 (0 -> 0, 1 -> -1).
#define ARGB1555TOARGB                                                      \
  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx          */ \
  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR          */ \
  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG          */ \
  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB          */ \
  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A          */ \
  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8  */ \
  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5  */ \
  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
  "vorr.u8    q1, q1, q3                     \n" /* R,A                 */ \
  "vorr.u8    q0, q0, q2                     \n" /* B,G                 */
1427
1428 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Unpacks 8 xRGB1555 pixels in q0 to 8-bit B (d0), G (d1), R (d2),
// ignoring the alpha bit. Top bits are replicated into low bits.
#define RGB555TOARGB                                                        \
  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG          */ \
  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5  */ \
  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5  */ \
  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8    d0, d0, d4                     \n" /* B                   */ \
  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3  */ \
  "vorr.u8    d2, d1, d5                     \n" /* R                   */ \
  "vorr.u8    d1, d4, d6                     \n" /* G                   */
1440
// Converts ARGB1555 to ARGB using the ARGB1555TOARGB unpack macro.
// NOTE(review): the vmov d3,#255 appears redundant — the macro rewrites d3
// (via q1) with the expanded 1-bit alpha each iteration; confirm upstream.
void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8     d3, #255                      \n"  // Alpha
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB1555 pixels.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
1459
// Unpacks 8 ARGB4444 pixels in q0 to 8-bit B (d0), G (d1), R (d2), A (d3).
// Each 4-bit nibble is replicated into both halves of the output byte.
#define ARGB4444TOARGB                                                      \
  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA        */ \
  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000        */ \
  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG        */ \
  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB        */ \
  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB        */ \
  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000        */ \
  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG        */ \
  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A  */
1469
// Converts ARGB4444 to ARGB using the ARGB4444TOARGB unpack macro.
// NOTE(review): the vmov d3,#255 appears redundant — the macro rewrites d3
// with the expanded 4-bit alpha each iteration; confirm upstream.
void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
                            uint8_t* dst_argb,
                            int width) {
  asm volatile(
      "vmov.u8     d3, #255                      \n"  // Alpha
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 8 ARGB4444 pixels.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_argb),      // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
1488
// Converts ARGB to RGB24 by dropping the alpha channel: vld4 deinterleaves
// 4 channels, vst3 re-interleaves only B, G, R. 16 pixels per loop.
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_rgb24,
                         int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB.
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vst3.8      {d0, d2, d4}, [%1]!           \n"  // store 16 RGB24 pixels.
      "vst3.8      {d1, d3, d5}, [%1]!           \n"
      "bgt         1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(dst_rgb24),  // %1
        "+r"(width)       // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
1507
// Converts ARGB to RAW: drops alpha and swaps R and B before the 3-channel
// interleaved store. 8 pixels per loop.
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d1, d2, d3, d4}, [%0]!       \n"  // load 8 pixels of ARGB.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vswp.u8     d1, d3                        \n"  // swap R, B
      "vst3.8      {d1, d2, d3}, [%1]!           \n"  // store 8 pixels of RAW.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_raw),   // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
1523
// Extracts the Y plane from YUY2 (Y,U,Y,V byte order): vld2 deinterleaves
// even bytes (Y) into q0 and odd bytes (UV) into q1; only q0 is stored.
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  asm volatile(
      "1:                                        \n"
      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pixels of YUY2.
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vst1.8      {q0}, [%1]!                   \n"  // store 16 pixels of Y.
      "bgt         1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
1538
// Extracts the Y plane from UYVY (U,Y,V,Y byte order): here the Y bytes are
// the ODD bytes, so q1 is stored rather than q0.
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
  asm volatile(
      "1:                                        \n"
      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pixels of UYVY.
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vst1.8      {q1}, [%1]!                   \n"  // store 16 pixels of Y.
      "bgt         1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
1553
// Extracts U and V planes from YUY2 without vertical subsampling: vld4
// splits the Y0,U,Y1,V stream so d1 = U and d3 = V. 16 pixels -> 8 U + 8 V.
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of YUY2.
      "subs        %3, %3, #16                   \n"  // 16 pixels = 8 UVs.
      "vst1.8      {d1}, [%1]!                   \n"  // store 8 U.
      "vst1.8      {d3}, [%2]!                   \n"  // store 8 V.
      "bgt         1b                            \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
1573
// Extracts U and V planes from UYVY without vertical subsampling: in the
// U,Y0,V,Y1 stream, d0 = U and d2 = V. 16 pixels -> 8 U + 8 V.
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of UYVY.
      "subs        %3, %3, #16                   \n"  // 16 pixels = 8 UVs.
      "vst1.8      {d0}, [%1]!                   \n"  // store 8 U.
      "vst1.8      {d2}, [%2]!                   \n"  // store 8 V.
      "bgt         1b                            \n"
      : "+r"(src_uyvy),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
        "+r"(width)      // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
1593
// Extracts U and V from YUY2 with 2x vertical subsampling: reads this row
// and the next (src + stride) and averages their chroma with vrhadd
// (rounding halving add). 16 pixels -> 8 U + 8 V.
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add         %1, %0, %1                    \n"  // stride + src_yuy2
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of YUY2.
      "subs        %4, %4, #16                   \n"  // 16 pixels = 8 UVs.
      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load next row YUY2.
      "vrhadd.u8   d1, d1, d5                    \n"  // average rows of U
      "vrhadd.u8   d3, d3, d7                    \n"  // average rows of V
      "vst1.8      {d1}, [%2]!                   \n"  // store 8 U.
      "vst1.8      {d3}, [%3]!                   \n"  // store 8 V.
      "bgt         1b                            \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}
1620
// Extracts U and V from UYVY with 2x vertical subsampling; same two-row
// vrhadd averaging as YUY2ToUVRow_NEON but chroma sits in d0/d2 (and d4/d6
// for the second row) because of the U,Y,V,Y byte order.
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
                      int stride_uyvy,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile(
      "add         %1, %0, %1                    \n"  // stride + src_uyvy
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 16 pixels of UYVY.
      "subs        %4, %4, #16                   \n"  // 16 pixels = 8 UVs.
      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load next row UYVY.
      "vrhadd.u8   d0, d0, d4                    \n"  // average rows of U
      "vrhadd.u8   d2, d2, d6                    \n"  // average rows of V
      "vst1.8      {d0}, [%2]!                   \n"  // store 8 U.
      "vst1.8      {d2}, [%3]!                   \n"  // store 8 V.
      "bgt         1b                            \n"
      : "+r"(src_uyvy),     // %0
        "+r"(stride_uyvy),  // %1
        "+r"(dst_u),        // %2
        "+r"(dst_v),        // %3
        "+r"(width)         // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}
1647
// Extracts interleaved UV from YUY2 with 2x vertical subsampling: vld2
// splits Y (q0/q2) from UV (q1/q3) for this row and the next, and the two
// UV rows are averaged with vrhadd. 16 pixels -> 16 UV bytes.
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(
      "add         %1, %0, %1                    \n"  // stride + src_yuy2
      "1:                                        \n"
      "vld2.8      {q0, q1}, [%0]!               \n"  // load 16 pixels of YUY2.
      "subs        %3, %3, #16                   \n"  // 16 pixels = 8 UVs.
      "vld2.8      {q2, q3}, [%1]!               \n"  // load next row YUY2.
      "vrhadd.u8   q4, q1, q3                    \n"  // average rows of UV
      "vst1.8      {q4}, [%2]!                   \n"  // store 8 UV.
      "bgt         1b                            \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_uv),       // %2
        "+r"(width)         // %3
      :
      // q4 aliases d8/d9, which are callee-saved on ARM EABI; they must be
      // in the clobber list (they were missing) so the compiler preserves
      // them around this asm.
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
        "d9"  // Clobber List
  );
}
1670
1671 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
                         const uint8_t* shuffler,
                         int width) {
  // 'shuffler' is a 16-byte table of source-byte indices; vtbl permutes the
  // 16 loaded bytes (4 pixels) per iteration, 8 output bytes per vtbl.
  asm volatile(
      "vld1.8      {q2}, [%3]                    \n"  // shuffler
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 4 pixels.
      "subs        %2, %2, #4                    \n"  // 4 processed per loop
      "vtbl.8      d2, {d0, d1}, d4              \n"  // look up 2 first pixels
      "vtbl.8      d3, {d0, d1}, d5              \n"  // look up 2 next pixels
      "vst1.8      {q1}, [%1]!                   \n"  // store 4.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(shuffler)    // %3
      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
  );
}
1692
// Packs planar I422 into YUY2: vld2 splits 16 Ys into even (d0) and odd
// (d2) bytes, and vst4 interleaves them with 8 Us (d1) and 8 Vs (d3) to
// produce the Y0,U,Y1,V byte order. 16 pixels per loop.
void I422ToYUY2Row_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_yuy2,
                        int width) {
  asm volatile(
      "1:                                        \n"
      "vld2.8      {d0, d2}, [%0]!               \n"  // load 16 Ys
      "vld1.8      {d1}, [%1]!                   \n"  // load 8 Us
      "vld1.8      {d3}, [%2]!                   \n"  // load 8 Vs
      "subs        %4, %4, #16                   \n"  // 16 pixels
      "vst4.8      {d0, d1, d2, d3}, [%3]!       \n"  // Store 8 YUY2/16 pixels.
      "bgt         1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_yuy2),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
}
1714
// Packs planar I422 into UYVY: same as I422ToYUY2Row_NEON but the register
// assignment is rotated so the store produces U,Y0,V,Y1 byte order.
void I422ToUYVYRow_NEON(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_uyvy,
                        int width) {
  asm volatile(
      "1:                                        \n"
      "vld2.8      {d1, d3}, [%0]!               \n"  // load 16 Ys
      "vld1.8      {d0}, [%1]!                   \n"  // load 8 Us
      "vld1.8      {d2}, [%2]!                   \n"  // load 8 Vs
      "subs        %4, %4, #16                   \n"  // 16 pixels
      "vst4.8      {d0, d1, d2, d3}, [%3]!       \n"  // Store 8 UYVY/16 pixels.
      "bgt         1b                            \n"
      : "+r"(src_y),     // %0
        "+r"(src_u),     // %1
        "+r"(src_v),     // %2
        "+r"(dst_uyvy),  // %3
        "+r"(width)      // %4
      :
      : "cc", "memory", "d0", "d1", "d2", "d3");
}
1736
// Converts ARGB to RGB565 using the ARGBTORGB565 pack macro (defined
// earlier in this file, not visible here); the macro leaves the packed
// result in q2. 8 pixels per loop.
void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_rgb565,
                          int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      ARGBTORGB565
      "vst1.8      {q2}, [%1]!                   \n"  // store 8 pixels RGB565.
      "bgt         1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_rgb565),  // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "d6");
}
1753
// Converts ARGB to RGB565 with ordered dither: the 4 dither bytes in d7 are
// added (saturating) to B, G and R before packing. Note the operand layout
// differs from the other rows here: dst is the sole output (%0) and src is
// an input (%1). 8 pixels per loop.
void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
                                uint8_t* dst_rgb,
                                uint32_t dither4,
                                int width) {
  asm volatile(
      "vdup.32     d7, %2                        \n"  // dither4
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%1]!       \n"  // load 8 pixels of ARGB.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vqadd.u8    d0, d0, d7                    \n"
      "vqadd.u8    d2, d2, d7                    \n"
      "vqadd.u8    d4, d4, d7                    \n"  // add for dither
      ARGBTORGB565
      "vst1.8      {q2}, [%0]!                   \n"  // store 8 RGB565.
      "bgt         1b                            \n"
      : "+r"(dst_rgb)    // %0
      : "r"(src_argb),   // %1
        "r"(dither4),    // %2
        "r"(width)       // %3
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
1775
// Converts ARGB to ARGB1555 using the ARGBTOARGB1555 pack macro (defined
// earlier in this file); the packed result is left in q3. 8 pixels per loop.
void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb1555,
                            int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      ARGBTOARGB1555
      "vst1.8      {q3}, [%1]!                   \n"  // store 8 ARGB1555.
      "bgt         1b                            \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb1555),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
1792
// Converts ARGB to ARGB4444 using the ARGBTOARGB4444 pack macro (defined
// earlier in this file); d7 holds the 0x0f mask it clears with vbic.
// 8 pixels per loop.
void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
                            uint8_t* dst_argb4444,
                            int width) {
  asm volatile(
      "vmov.u8     d7, #0x0f                     \n"  // bits to clear with
                                                      // vbic.
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 pixels of ARGB.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      ARGBTOARGB4444
      "vst1.8      {q0}, [%1]!                   \n"  // store 8 ARGB4444.
      "bgt         1b                            \n"
      : "+r"(src_argb),      // %0
        "+r"(dst_argb4444),  // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
1811
// Extracts the alpha channel from ARGB: vld4 deinterleaves the channels so
// q3 collects 16 A bytes, which are stored as a plane. 16 pixels per loop.
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
                              uint8_t* dst_a,
                              int width) {
  asm volatile(
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"  // load next 8 ARGB pixels
      "subs        %2, %2, #16                   \n"  // 16 processed per loop
      "vst1.8      {q3}, [%1]!                   \n"  // store 16 A's.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_a),     // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
1829
// Per-channel coefficients for RGB -> U/V conversion, loaded as 8 bytes by
// ARGBToUV444MatrixRow_NEON (bytes 0-2: B,G,R for U; bytes 4-6: B,G,R for V;
// bytes 3 and 7 are padding).
struct RgbUVConstants {
  uint8_t kRGBToU[4];
  uint8_t kRGBToV[4];
};
1834
1835 // 8x1 pixels.
// Computes full-resolution (444) U and V planes from ARGB using the given
// coefficient matrix. U = 128 + (UB*B - UG*G - UR*R)/256 and
// V = 128 + (VR*R - VG*G - VB*B)/256; d24 doubles as both UB and VR since
// those coefficients are equal in the tables below. vaddhn adds 0x8080
// (128 in the high byte plus 0.5 for rounding) and narrows to the high
// byte, implementing the +128 bias, rounding and /256 in one step.
// 8 pixels per loop.
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
                               uint8_t* dst_u,
                               uint8_t* dst_v,
                               int width,
                               const struct RgbUVConstants* rgbuvconstants) {
  asm volatile(

      "vld1.8      {d0}, [%4]                    \n"  // load rgbuvconstants
      "vdup.u8     d24, d0[0]                    \n"  // UB  0.875  coefficient
      "vdup.u8     d25, d0[1]                    \n"  // UG -0.5781 coefficient
      "vdup.u8     d26, d0[2]                    \n"  // UR -0.2969 coefficient
      "vdup.u8     d27, d0[4]                    \n"  // VB -0.1406 coefficient
      "vdup.u8     d28, d0[5]                    \n"  // VG -0.7344 coefficient
      "vmov.u16    q15, #0x8080                  \n"  // 128.5

      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q2, d0, d24                   \n"  // B
      "vmlsl.u8    q2, d1, d25                   \n"  // G
      "vmlsl.u8    q2, d2, d26                   \n"  // R

      "vmull.u8    q3, d2, d24                   \n"  // R
      "vmlsl.u8    q3, d1, d28                   \n"  // G
      "vmlsl.u8    q3, d0, d27                   \n"  // B

      "vaddhn.u16  d0, q2, q15                   \n"  // +128 -> unsigned
      "vaddhn.u16  d1, q3, q15                   \n"  // +128 -> unsigned

      "vst1.8      {d0}, [%1]!                   \n"  // store 8 pixels U.
      "vst1.8      {d1}, [%2]!                   \n"  // store 8 pixels V.
      "bgt         1b                            \n"
      : "+r"(src_argb),       // %0
        "+r"(dst_u),          // %1
        "+r"(dst_v),          // %2
        "+r"(width)           // %3
      : "r"(rgbuvconstants)   // %4
      // "q4" was removed from the clobber list: q4 (d8/d9, callee-saved) is
      // never referenced by this asm, and listing it forced a needless
      // save/restore of d8/d9 in the prologue/epilogue.
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13", "q14",
        "q15");
}
1876
1877 // RGB to bt601 coefficients
1878 // UB 0.875 coefficient = 112
1879 // UG -0.5781 coefficient = 74
1880 // UR -0.2969 coefficient = 38
1881 // VB -0.1406 coefficient = 18
1882 // VG -0.7344 coefficient = 94
1883 // VR 0.875 coefficient = 112 (ignored)
1884
// BT.601 studio-range coefficients; trailing zeros are struct padding.
static const struct RgbUVConstants kRgb24I601UVConstants = {{112, 74, 38, 0},
                                                            {18, 94, 112, 0}};
1887
1888 // RGB to JPeg coefficients
1889 // UB coeff 0.500 = 127
1890 // UG coeff -0.33126 = 84
1891 // UR coeff -0.16874 = 43
1892 // VB coeff -0.08131 = 20
1893 // VG coeff -0.41869 = 107
1894 // VR coeff 0.500 = 127 (ignored)
1895
// JPEG (full-range) coefficients; trailing zeros are struct padding.
static const struct RgbUVConstants kRgb24JPegUVConstants = {{127, 84, 43, 0},
                                                            {20, 107, 127, 0}};
1898
ARGBToUV444Row_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1899 void ARGBToUV444Row_NEON(const uint8_t* src_argb,
1900 uint8_t* dst_u,
1901 uint8_t* dst_v,
1902 int width) {
1903 ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
1904 &kRgb24I601UVConstants);
1905 }
1906
ARGBToUVJ444Row_NEON(const uint8_t * src_argb,uint8_t * dst_u,uint8_t * dst_v,int width)1907 void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
1908 uint8_t* dst_u,
1909 uint8_t* dst_v,
1910 int width) {
1911 ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
1912 &kRgb24JPegUVConstants);
1913 }
1914
// clang-format off
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Inputs QB/QG/QR hold 2x-scale channel averages (sum of 4 pixels >> 1);
// the q10..q14 coefficients are pre-halved to compensate. vaddhn adds the
// 0x8080 bias in q15 and narrows by >>8 in one instruction.
#define RGBTOUV(QB, QG, QR)                                                 \
  "vmul.s16   q8, " #QB ", q10               \n" /* B                    */ \
  "vmls.s16   q8, " #QG ", q11               \n" /* G                    */ \
  "vmls.s16   q8, " #QR ", q12               \n" /* R                    */ \
  "vmul.s16   q9, " #QR ", q10               \n" /* R                    */ \
  "vmls.s16   q9, " #QG ", q14               \n" /* G                    */ \
  "vmls.s16   q9, " #QB ", q13               \n" /* B                    */ \
  "vaddhn.u16 d0, q8, q15                    \n" /* +128 -> unsigned     */ \
  "vaddhn.u16 d1, q9, q15                    \n" /* +128 -> unsigned     */
// clang-format on
1927
1928 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// Computes one row of 2x2-subsampled BT.601 studio-range U/V from two rows
// of ARGB: 16x2 pixels in -> 8 U and 8 V out. %1 is rewritten to point at
// the second source row. Coefficients are half-scale to match the 2x-scale
// averages fed to RGBTOUV. Assumes width is a multiple of 16 -- TODO confirm.
void ARGBToUVRow_NEON(const uint8_t* src_argb,
                      int src_stride_argb,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
1973
1974 // TODO(fbarchard): Subsample match Intel code.
// Same 16x2 -> 8x1 subsampled U/V as ARGBToUVRow_NEON but with JPEG
// (full-range) coefficients. Assumes width is a multiple of 16 -- TODO
// confirm.
void ARGBToUVJRow_NEON(const uint8_t* src_argb,
                       int src_stride_argb,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_argb
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"  // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"  // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"  // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ARGB pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ARGB pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb),         // %0
        "+r"(src_stride_argb),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2019
// 16x2 -> 8x1 subsampled U/V with JPEG coefficients from ABGR input
// (memory byte order R,G,B,A: q0=R, q1=G, q2=B), so RGBTOUV's B/R
// arguments are swapped relative to the ARGB variant.
// Assumes width is a multiple of 16 -- TODO confirm.
void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
                       int src_stride_abgr,
                       uint8_t* dst_uj,
                       uint8_t* dst_vj,
                       int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_abgr
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"  // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"  // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"  // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ABGR pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ABGR pixels.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ABGR pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ABGR pixels.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_abgr),         // %0
        "+r"(src_stride_abgr),  // %1
        "+r"(dst_uj),           // %2
        "+r"(dst_vj),           // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2064
2065 // TODO(fbarchard): Subsample match C code.
// 16x2 -> 8x1 subsampled U/V with JPEG coefficients from RGB24 input
// (memory byte order B,G,R: q0=B, q1=G, q2=R), loaded with 3-way vld3.
// Assumes width is a multiple of 16 -- TODO confirm.
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
                        int src_stride_rgb24,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_rgb24
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"  // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"  // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"  // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB24 pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RGB24 pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RGB24 pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RGB24 pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb24),         // %0
        "+r"(src_stride_rgb24),  // %1
        "+r"(dst_u),             // %2
        "+r"(dst_v),             // %3
        "+r"(width)              // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2110
2111 // TODO(fbarchard): Subsample match C code.
// 16x2 -> 8x1 subsampled U/V with JPEG coefficients from RAW input
// (memory byte order R,G,B: q0=R, q1=G, q2=B), hence RGBTOUV(q2, q1, q0).
// NOTE(review): the original channel comments had B and R swapped here;
// corrected to match RAWToUVRow_NEON and the RGBTOUV argument order.
// Assumes width is a multiple of 16 -- TODO confirm.
void RAWToUVJRow_NEON(const uint8_t* src_raw,
                      int src_stride_raw,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_raw
      "vmov.s16 q10, #127 / 2 \n"  // UB / VR 0.500 coefficient
      "vmov.s16 q11, #84 / 2 \n"  // UG -0.33126 coefficient
      "vmov.s16 q12, #43 / 2 \n"  // UR -0.16874 coefficient
      "vmov.s16 q13, #20 / 2 \n"  // VB -0.08131 coefficient
      "vmov.s16 q14, #107 / 2 \n"  // VG -0.41869 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RAW pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RAW pixels.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RAW pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RAW pixels.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_raw),         // %0
        "+r"(src_stride_raw),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2156
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from BGRA input
// (memory byte order A,R,G,B: q1=R, q2=G, q3=B), hence RGBTOUV(q3, q2, q1).
// Assumes width is a multiple of 16 -- TODO confirm.
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_bgra
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 BGRA pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 BGRA pixels.
      "vpaddl.u8 q3, q3 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more BGRA pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 BGRA pixels.
      "vpadal.u8 q3, q7 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q1, q1, #1 \n"  // 2x average
      "vrshr.u16 q2, q2, #1 \n"
      "vrshr.u16 q3, q3, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q3, q2, q1)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_bgra),         // %0
        "+r"(src_stride_bgra),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2201
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from ABGR input
// (memory byte order R,G,B,A: q0=R, q1=G, q2=B), hence RGBTOUV(q2, q1, q0).
// Assumes width is a multiple of 16 -- TODO confirm.
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_abgr
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ABGR pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ABGR pixels.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more ABGR pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 ABGR pixels.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_abgr),         // %0
        "+r"(src_stride_abgr),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2246
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from RGBA input
// (memory byte order A,B,G,R); the vpaddl step shifts channels down one
// register so q0=B, q1=G, q2=R afterwards, matching RGBTOUV(q0, q1, q2).
// Assumes width is a multiple of 16 -- TODO confirm.
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
                      int src_stride_rgba,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_rgba
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 RGBA pixels.
      "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 RGBA pixels.
      "vpaddl.u8 q0, q1 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q2 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q3 \n"  // R 16 bytes -> 8 shorts.
      "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more RGBA pixels.
      "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 RGBA pixels.
      "vpadal.u8 q0, q5 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q6 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q7 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgba),         // %0
        "+r"(src_stride_rgba),  // %1
        "+r"(dst_u),            // %2
        "+r"(dst_v),            // %3
        "+r"(width)             // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2291
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from RGB24 input
// (memory byte order B,G,R: q0=B, q1=G, q2=R), loaded with 3-way vld3.
// Assumes width is a multiple of 16 -- TODO confirm.
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
                       int src_stride_rgb24,
                       uint8_t* dst_u,
                       uint8_t* dst_v,
                       int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_rgb24
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RGB24 pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RGB24 pixels.
      "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RGB24 pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RGB24 pixels.
      "vpadal.u8 q0, q4 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q2, q6 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb24),         // %0
        "+r"(src_stride_rgb24),  // %1
        "+r"(dst_u),             // %2
        "+r"(dst_v),             // %3
        "+r"(width)              // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2336
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from RAW input
// (memory byte order R,G,B: q0=R, q1=G, q2=B), hence RGBTOUV(q2, q1, q0).
// Assumes width is a multiple of 16 -- TODO confirm.
void RAWToUVRow_NEON(const uint8_t* src_raw,
                     int src_stride_raw,
                     uint8_t* dst_u,
                     uint8_t* dst_v,
                     int width) {
  asm volatile (
      "add %1, %0, %1 \n"  // src_stride + src_raw
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875 coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld3.8 {d0, d2, d4}, [%0]! \n"  // load 8 RAW pixels.
      "vld3.8 {d1, d3, d5}, [%0]! \n"  // load next 8 RAW pixels.
      "vpaddl.u8 q2, q2 \n"  // B 16 bytes -> 8 shorts.
      "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
      "vpaddl.u8 q0, q0 \n"  // R 16 bytes -> 8 shorts.
      "vld3.8 {d8, d10, d12}, [%1]! \n"  // load 8 more RAW pixels.
      "vld3.8 {d9, d11, d13}, [%1]! \n"  // load last 8 RAW pixels.
      "vpadal.u8 q2, q6 \n"  // B 16 bytes -> 8 shorts.
      "vpadal.u8 q1, q5 \n"  // G 16 bytes -> 8 shorts.
      "vpadal.u8 q0, q4 \n"  // R 16 bytes -> 8 shorts.

      "vrshr.u16 q0, q0, #1 \n"  // 2x average
      "vrshr.u16 q1, q1, #1 \n"
      "vrshr.u16 q2, q2, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q2, q1, q0)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_raw),         // %0
        "+r"(src_stride_raw),  // %1
        "+r"(dst_u),           // %2
        "+r"(dst_v),           // %3
        "+r"(width)            // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
        "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
2381
2382 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from RGB565 input.
// Each 8-pixel group is first expanded to 8-bit B/G/R via RGB565TOARGB
// (defined earlier), then accumulated into q4/q5/q6. Unlike the byte-format
// rows this uses vadd + vqshrn (saturating) instead of vaddhn to narrow.
// Assumes width is a multiple of 16 -- TODO confirm.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
                        int src_stride_rgb565,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
                        int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_rgb565
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpaddl.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.

      "vld1.8 {q0}, [%1]! \n"  // load 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 RGB565 pixels.
      RGB565TOARGB
      "vpadal.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.

      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_rgb565),         // %0
        "+r"(src_stride_rgb565),  // %1
        "+r"(dst_u),              // %2
        "+r"(dst_v),              // %3
        "+r"(width)               // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
2447
2448 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from ARGB1555 input.
// RGB555TOARGB (defined earlier) expands each group to 8-bit B/G/R in
// d0/d1/d2 before accumulation; alpha is ignored. Uses the same
// vadd + vqshrn narrowing as RGB565ToUVRow_NEON.
// Assumes width is a multiple of 16 -- TODO confirm.
void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
                          int src_stride_argb1555,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_argb1555
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpaddl.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.

      "vld1.8 {q0}, [%1]! \n"  // load 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 ARGB1555 pixels.
      RGB555TOARGB
      "vpadal.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.

      "vrshr.u16 q4, q4, #1 \n"  // 2x average
      "vrshr.u16 q5, q5, #1 \n"
      "vrshr.u16 q6, q6, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      "vmul.s16 q8, q4, q10 \n"  // B
      "vmls.s16 q8, q5, q11 \n"  // G
      "vmls.s16 q8, q6, q12 \n"  // R
      "vadd.u16 q8, q8, q15 \n"  // +128 -> unsigned
      "vmul.s16 q9, q6, q10 \n"  // R
      "vmls.s16 q9, q5, q14 \n"  // G
      "vmls.s16 q9, q4, q13 \n"  // B
      "vadd.u16 q9, q9, q15 \n"  // +128 -> unsigned
      "vqshrn.u16 d0, q8, #8 \n"  // 16 bit to 8 bit U
      "vqshrn.u16 d1, q9, #8 \n"  // 16 bit to 8 bit V
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb1555),         // %0
        "+r"(src_stride_argb1555),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
2513
2514 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// 16x2 -> 8x1 subsampled U/V with BT.601 coefficients from ARGB4444 input.
// ARGB4444TOARGB (defined earlier) expands each group to 8-bit B/G/R in
// d0/d1/d2; alpha is ignored. Averages land in q0/q1/q2 so the shared
// RGBTOUV macro can narrow via vaddhn.
// Assumes width is a multiple of 16 -- TODO confirm.
void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
                          int src_stride_argb4444,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // src_stride + src_argb4444
      "vmov.s16 q10, #112 / 2 \n"  // UB / VR 0.875
                                   // coefficient
      "vmov.s16 q11, #74 / 2 \n"  // UG -0.5781 coefficient
      "vmov.s16 q12, #38 / 2 \n"  // UR -0.2969 coefficient
      "vmov.s16 q13, #18 / 2 \n"  // VB -0.1406 coefficient
      "vmov.s16 q14, #94 / 2 \n"  // VG -0.7344 coefficient
      "vmov.u16 q15, #0x8080 \n"  // 128.5
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%0]! \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpaddl.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpaddl.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpaddl.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.

      "vld1.8 {q0}, [%1]! \n"  // load 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8 d8, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d10, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d12, d2 \n"  // R 8 bytes -> 4 shorts.
      "vld1.8 {q0}, [%1]! \n"  // next 8 ARGB4444 pixels.
      ARGB4444TOARGB
      "vpadal.u8 d9, d0 \n"  // B 8 bytes -> 4 shorts.
      "vpadal.u8 d11, d1 \n"  // G 8 bytes -> 4 shorts.
      "vpadal.u8 d13, d2 \n"  // R 8 bytes -> 4 shorts.

      "vrshr.u16 q0, q4, #1 \n"  // 2x average
      "vrshr.u16 q1, q5, #1 \n"
      "vrshr.u16 q2, q6, #1 \n"

      "subs %4, %4, #16 \n"  // 16 processed per loop.
      RGBTOUV(q0, q1, q2)
      "vst1.8 {d0}, [%2]! \n"  // store 8 pixels U.
      "vst1.8 {d1}, [%3]! \n"  // store 8 pixels V.
      "bgt 1b \n"
      : "+r"(src_argb4444),         // %0
        "+r"(src_stride_argb4444),  // %1
        "+r"(dst_u),                // %2
        "+r"(dst_v),                // %3
        "+r"(width)                 // %4
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
}
2570
// Converts 8 RGB565 pixels per iteration to studio-range Y:
// Y = (25*B + 129*G + 66*R + 128) >> 8 + 16, with saturating add of 16.
// Assumes width is a positive multiple of 8 -- TODO confirm.
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
  asm volatile(
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 RGB565 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      RGB565TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrn.u16 d0, q2, #8 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_rgb565),  // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
2594
// Converts 8 ARGB1555 pixels per iteration to studio-range Y; alpha is
// ignored. Same coefficients and rounding as RGB565ToYRow_NEON.
// Assumes width is a positive multiple of 8 -- TODO confirm.
void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB1555 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB1555TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrn.u16 d0, q2, #8 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb1555),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
2620
// Converts 8 ARGB4444 pixels per iteration to studio-range Y; alpha is
// ignored. Same coefficients and rounding as RGB565ToYRow_NEON.
// Assumes width is a positive multiple of 8 -- TODO confirm.
void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
                         uint8_t* dst_y,
                         int width) {
  asm volatile(
      "vmov.u8 d24, #25 \n"  // B * 0.1016 coefficient
      "vmov.u8 d25, #129 \n"  // G * 0.5078 coefficient
      "vmov.u8 d26, #66 \n"  // R * 0.2578 coefficient
      "vmov.u8 d27, #16 \n"  // Add 16 constant
      "1: \n"
      "vld1.8 {q0}, [%0]! \n"  // load 8 ARGB4444 pixels.
      "subs %2, %2, #8 \n"  // 8 processed per loop.
      ARGB4444TOARGB
      "vmull.u8 q2, d0, d24 \n"  // B
      "vmlal.u8 q2, d1, d25 \n"  // G
      "vmlal.u8 q2, d2, d26 \n"  // R
      "vqrshrn.u16 d0, q2, #8 \n"  // 16 bit to 8 bit Y
      "vqadd.u8 d0, d27 \n"
      "vst1.8 {d0}, [%1]! \n"  // store 8 pixels Y.
      "bgt 1b \n"
      : "+r"(src_argb4444),  // %0
        "+r"(dst_y),         // %1
        "+r"(width)          // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
}
2646
// Widen 8 ARGB (8-bit) pixels to 8 AR64 (16-bit per channel) pixels.
// Each 16-bit channel is the 8-bit value replicated into both bytes
// (0xAB -> 0xABAB), which the vst2.8 interleave of identical q-registers
// produces directly.
void ARGBToAR64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ar64,
                        int width) {
  asm volatile(
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 4 ARGB pixels
      "vld1.8      {q2}, [%0]!                   \n"  // load 4 more
      "vmov.u8     q1, q0                        \n"  // duplicate for interleave
      "vmov.u8     q3, q2                        \n"
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vst2.8      {q0, q1}, [%1]!               \n"  // store 4 pixels
      "vst2.8      {q2, q3}, [%1]!               \n"  // store 4 pixels
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_ar64),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
2666
// vtbl byte-index table that swaps R and B within each 4-byte pixel
// (ARGB -> ABGR); alpha bytes (indices 3, 7, 11, 15) stay in place.
static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
                                         10, 9, 8, 11, 14, 13, 12, 15};
2669
// Widen 8 ARGB (8-bit) pixels to 8 AB64 (16-bit per channel) pixels,
// swapping R and B via the kShuffleARGBToABGR vtbl table, then replicating
// each byte into both halves of its 16-bit channel with vst2.8.
void ARGBToAB64Row_NEON(const uint8_t* src_argb,
                        uint16_t* dst_ab64,
                        int width) {
  asm volatile(
      "vld1.8      {q4}, [%3]                    \n"  // shuffler

      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 4 ARGB pixels
      "vld1.8      {q2}, [%0]!                   \n"  // load 4 more
      "vtbl.8      d2, {d0, d1}, d8              \n"  // swap R/B -> q1
      "vtbl.8      d3, {d0, d1}, d9              \n"
      "vtbl.8      d6, {d4, d5}, d8              \n"  // swap R/B -> q3
      "vtbl.8      d7, {d4, d5}, d9              \n"
      "vmov.u8     q0, q1                        \n"  // duplicate for interleave
      "vmov.u8     q2, q3                        \n"
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vst2.8      {q0, q1}, [%1]!               \n"  // store 4 pixels
      "vst2.8      {q2, q3}, [%1]!               \n"  // store 4 pixels
      "bgt         1b                            \n"
      : "+r"(src_argb),            // %0
        "+r"(dst_ab64),            // %1
        "+r"(width)                // %2
      : "r"(&kShuffleARGBToABGR)   // %3
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
2695
// Narrow 8 AR64 (16-bit per channel) pixels to 8 ARGB (8-bit) pixels by
// keeping the high byte of each 16-bit channel (vshrn #8).
void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "1:                                        \n"
      "vld1.16     {q0}, [%0]!                   \n"  // load 2 AR64 pixels
      "vld1.16     {q1}, [%0]!                   \n"
      "vld1.16     {q2}, [%0]!                   \n"
      "vld1.16     {q3}, [%0]!                   \n"
      "vshrn.u16   d0, q0, #8                    \n"  // keep high byte of each
      "vshrn.u16   d1, q1, #8                    \n"  // 16-bit channel
      "vshrn.u16   d4, q2, #8                    \n"
      "vshrn.u16   d5, q3, #8                    \n"
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vst1.8      {q0}, [%1]!                   \n"  // store 4 pixels
      "vst1.8      {q2}, [%1]!                   \n"  // store 4 pixels
      "bgt         1b                            \n"
      : "+r"(src_ar64),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
2719
// vtbl byte-index table: picks the high byte of each 16-bit AB64 channel and
// swaps R/B, yielding ARGB byte order. Only 8 of the 16 entries are used
// (loaded as a single d-register); the rest default to zero.
static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
2721
// Narrow 8 AB64 (16-bit per channel) pixels to 8 ARGB (8-bit) pixels.
// The vtbl shuffle simultaneously takes the high byte of each channel and
// swaps R/B (see kShuffleAB64ToARGB).
void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
                        uint8_t* dst_argb,
                        int width) {
  asm volatile(
      "vld1.8      {d8}, [%3]                    \n"  // shuffler

      "1:                                        \n"
      "vld1.16     {q0}, [%0]!                   \n"  // load 2 AB64 pixels
      "vld1.16     {q1}, [%0]!                   \n"
      "vld1.16     {q2}, [%0]!                   \n"
      "vld1.16     {q3}, [%0]!                   \n"
      "vtbl.8      d0, {d0, d1}, d8              \n"  // high bytes, R/B swapped
      "vtbl.8      d1, {d2, d3}, d8              \n"
      "vtbl.8      d4, {d4, d5}, d8              \n"
      "vtbl.8      d5, {d6, d7}, d8              \n"
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vst1.8      {q0}, [%1]!                   \n"  // store 4 pixels
      "vst1.8      {q2}, [%1]!                   \n"  // store 4 pixels
      "bgt         1b                            \n"
      : "+r"(src_ab64),           // %0
        "+r"(dst_argb),           // %1
        "+r"(width)               // %2
      : "r"(&kShuffleAB64ToARGB)  // %3
      : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
2747
// Per-colorspace luma constants: three byte-sized channel weights in source
// byte order (4th entry unused) plus a 16-bit rounding/offset term added
// before the final >> 8 (via vaddhn).
struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
};

// RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};

// Same JPeg weights with R and B swapped, for RAW/ABGR byte order.
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};

// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080

static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080};

// BT.601 weights with R and B swapped, for RAW/ABGR byte order.
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
2772
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
// Converts 16 ARGB pixels per loop to Y using the supplied weights/offset.
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_y,
                           int width,
                           const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vld1.8      {d0}, [%3]                    \n"  // load rgbconstants
      "vdup.u8     d20, d0[0]                    \n"  // B coefficient
      "vdup.u8     d21, d0[1]                    \n"  // G coefficient
      "vdup.u8     d22, d0[2]                    \n"  // R coefficient
      "vdup.u16    q12, d0[2]                    \n"  // kAddY: u16 lane 2 =
                                                      // struct bytes 4-5
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of ARGB
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d0, d20                   \n"  // B
      "vmull.u8    q9, d1, d20                   \n"
      "vmlal.u8    q8, d2, d21                   \n"  // G
      "vmlal.u8    q9, d3, d21                   \n"
      "vmlal.u8    q8, d4, d22                   \n"  // R
      "vmlal.u8    q9, d5, d22                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
      "bgt         1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(rgbconstants)  // %3
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
        "q12");
}
2805
// ARGB -> Y, BT.601 (studio range, +16 offset).
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
}

// ARGB -> YJ, JPeg (full range) coefficients.
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}

// ABGR -> Y: reuses the ARGB kernel with R/B-swapped weights.
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
}

// ABGR -> YJ: reuses the ARGB kernel with R/B-swapped JPeg weights.
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
}
2821
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
// Same code as ARGB, except the LD4 channel registers are shifted by one
// (d0/d1 hold the skipped alpha).
void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
                           uint8_t* dst_y,
                           int width,
                           const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vld1.8      {d0}, [%3]                    \n"  // load rgbconstants
      "vdup.u8     d20, d0[0]                    \n"  // B coefficient
      "vdup.u8     d21, d0[1]                    \n"  // G coefficient
      "vdup.u8     d22, d0[2]                    \n"  // R coefficient
      "vdup.u16    q12, d0[2]                    \n"  // kAddY: u16 lane 2 =
                                                      // struct bytes 4-5
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 16 pixels of RGBA
      "vld4.8      {d1, d3, d5, d7}, [%0]!       \n"
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d2, d20                   \n"  // B
      "vmull.u8    q9, d3, d20                   \n"
      "vmlal.u8    q8, d4, d21                   \n"  // G
      "vmlal.u8    q9, d5, d21                   \n"
      "vmlal.u8    q8, d6, d22                   \n"  // R
      "vmlal.u8    q9, d7, d22                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
      "bgt         1b                            \n"
      : "+r"(src_rgba),    // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(rgbconstants)  // %3
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
        "q12");
}
2855
// RGBA -> Y, BT.601 coefficients.
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
}

// RGBA -> YJ, JPeg coefficients.
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
}

// BGRA -> Y: reuses the RGBA kernel with R/B-swapped weights.
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
}
2867
// 3-byte-per-pixel (RGB24/RAW) variant of the Y conversion: same math as
// ARGBToYMatrixRow_NEON but loads with vld3 (no alpha channel).
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
                          uint8_t* dst_y,
                          int width,
                          const struct RgbConstants* rgbconstants) {
  asm volatile(
      "vld1.8      {d0}, [%3]                    \n"  // load rgbconstants
      "vdup.u8     d20, d0[0]                    \n"  // B coefficient
      "vdup.u8     d21, d0[1]                    \n"  // G coefficient
      "vdup.u8     d22, d0[2]                    \n"  // R coefficient
      "vdup.u16    q12, d0[2]                    \n"  // kAddY: u16 lane 2 =
                                                      // struct bytes 4-5
      "1:                                        \n"
      "vld3.8      {d2, d4, d6}, [%0]!           \n"  // load 16 pixels of
                                                      // RGB24.
      "vld3.8      {d3, d5, d7}, [%0]!           \n"
      "subs        %2, %2, #16                   \n"  // 16 processed per loop.
      "vmull.u8    q8, d2, d20                   \n"  // B
      "vmull.u8    q9, d3, d20                   \n"
      "vmlal.u8    q8, d4, d21                   \n"  // G
      "vmlal.u8    q9, d5, d21                   \n"
      "vmlal.u8    q8, d6, d22                   \n"  // R
      "vmlal.u8    q9, d7, d22                   \n"
      "vaddhn.u16  d0, q8, q12                   \n"  // 16 bit to 8 bit Y
      "vaddhn.u16  d1, q9, q12                   \n"
      "vst1.8      {d0, d1}, [%1]!               \n"  // store 16 pixels Y.
      "bgt         1b                            \n"
      : "+r"(src_rgb),     // %0
        "+r"(dst_y),       // %1
        "+r"(width)        // %2
      : "r"(rgbconstants)  // %3
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
        "q12");
}
2900
// RGB24 -> YJ, JPeg coefficients.
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}

// RAW (RGB byte order) -> YJ: R/B-swapped JPeg weights.
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
}

// RGB24 -> Y, BT.601 coefficients.
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
}

// RAW (RGB byte order) -> Y: R/B-swapped BT.601 weights.
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
}
2916
// Bilinear filter 16x2 -> 16x1
// Blends two rows: dst = src_ptr * (256 - fraction)/256 +
// (src_ptr + src_stride) * fraction/256, 16 pixels per iteration.
// Special-cased fast paths for fraction == 0 (plain copy) and 128 (vrhadd).
void InterpolateRow_NEON(uint8_t* dst_ptr,
                         const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  asm volatile(
      "cmp         %4, #0                        \n"
      "beq         100f                          \n"  // fraction 0: copy row
      "add         %2, %1                        \n"  // %2 = ptr to second row
      "cmp         %4, #128                      \n"
      "beq         50f                           \n"  // fraction 128: average

      "vdup.8      d5, %4                        \n"  // y1 weight
      "rsb         %4, #256                      \n"  // 256 - fraction
      "vdup.8      d4, %4                        \n"  // y0 weight
      // General purpose row blend.
      "1:                                        \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vmull.u8    q13, d0, d4                   \n"
      "vmull.u8    q14, d1, d4                   \n"
      "vmlal.u8    q13, d2, d5                   \n"
      "vmlal.u8    q14, d3, d5                   \n"
      "vrshrn.u16  d0, q13, #8                   \n"  // round and scale back
      "vrshrn.u16  d1, q14, #8                   \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         1b                            \n"
      "b           99f                           \n"

      // Blend 50 / 50.
      "50:                                       \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "vld1.8      {q1}, [%2]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vrhadd.u8   q0, q1                        \n"  // rounding average
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         50b                           \n"
      "b           99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100:                                      \n"
      "vld1.8      {q0}, [%1]!                   \n"
      "subs        %3, %3, #16                   \n"
      "vst1.8      {q0}, [%0]!                   \n"
      "bgt         100b                          \n"

      "99:                                       \n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(src_stride),  // %2
        "+r"(dst_width),   // %3
        "+r"(y1_fraction)  // %4
      :
      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
}
2975
// Bilinear filter 8x2 -> 8x1
// 16-bit-sample variant of InterpolateRow_NEON: blends 8 uint16 pixels per
// iteration between src_ptr and src_ptr + src_stride. The second-row pointer
// and both weights are computed in C rather than in asm.
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
                            const uint16_t* src_ptr,
                            ptrdiff_t src_stride,
                            int dst_width,
                            int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint16_t* src_ptr1 = src_ptr + src_stride;

  asm volatile(
      "cmp         %4, #0                        \n"
      "beq         100f                          \n"  // fraction 0: copy row
      "cmp         %4, #128                      \n"
      "beq         50f                           \n"  // fraction 128: average

      "vdup.16     d17, %4                       \n"  // y1 weight
      "vdup.16     d16, %5                       \n"  // y0 weight
      // General purpose row blend.
      "1:                                        \n"
      "vld1.16     {q0}, [%1]!                   \n"
      "vld1.16     {q1}, [%2]!                   \n"
      "subs        %3, %3, #8                    \n"
      "vmull.u16   q2, d0, d16                   \n"
      "vmull.u16   q3, d1, d16                   \n"
      "vmlal.u16   q2, d2, d17                   \n"
      "vmlal.u16   q3, d3, d17                   \n"
      "vrshrn.u32  d0, q2, #8                    \n"  // round and scale back
      "vrshrn.u32  d1, q3, #8                    \n"
      "vst1.16     {q0}, [%0]!                   \n"
      "bgt         1b                            \n"
      "b           99f                           \n"

      // Blend 50 / 50.
      "50:                                       \n"
      "vld1.16     {q0}, [%1]!                   \n"
      "vld1.16     {q1}, [%2]!                   \n"
      "subs        %3, %3, #8                    \n"
      "vrhadd.u16  q0, q1                        \n"  // rounding average
      "vst1.16     {q0}, [%0]!                   \n"
      "bgt         50b                           \n"
      "b           99f                           \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100:                                      \n"
      "vld1.16     {q0}, [%1]!                   \n"
      "subs        %3, %3, #8                    \n"
      "vst1.16     {q0}, [%0]!                   \n"
      "bgt         100b                          \n"

      "99:                                       \n"
      : "+r"(dst_ptr),      // %0
        "+r"(src_ptr),      // %1
        "+r"(src_ptr1),     // %2
        "+r"(dst_width)     // %3
      : "r"(y1_fraction),   // %4
        "r"(y0_fraction)    // %5
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8");
}
3035
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
// Alpha-blend src over dst: main loop handles 8 pixels; after it falls below
// 8 remaining, a scalar-lane loop finishes 1 pixel at a time. Output alpha is
// forced to 255.
void ARGBBlendRow_NEON(const uint8_t* src_argb,
                       const uint8_t* src_argb1,
                       uint8_t* dst_argb,
                       int width) {
  asm volatile(
      "subs        %3, #8                        \n"  // fewer than 8? skip to
      "blt         89f                           \n"  // the 1-pixel loop
      // Blend 8 pixels.
      "8:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ARGB0.
      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load 8 pixels of ARGB1.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q10, d4, d3                   \n"  // db * a
      "vmull.u8    q11, d5, d3                   \n"  // dg * a
      "vmull.u8    q12, d6, d3                   \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
      "vqsub.u8    q2, q2, q10                   \n"  // dbg - dbg * a / 256
      "vqsub.u8    d6, d6, d22                   \n"  // dr - dr * a / 256
      "vqadd.u8    q0, q0, q2                    \n"  // + sbg
      "vqadd.u8    d2, d2, d6                    \n"  // + sr
      "vmov.u8     d3, #255                      \n"  // a = 255
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 pixels of ARGB.
      "bge         8b                            \n"

      "89:                                       \n"
      "adds        %3, #8-1                      \n"  // restore remainder - 1
      "blt         99f                           \n"

      // Blend 1 pixels.
      "1:                                        \n"
      "vld4.8      {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
      "vld4.8      {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
      "subs        %3, %3, #1                    \n"  // 1 processed per loop.
      "vmull.u8    q10, d4, d3                   \n"  // db * a
      "vmull.u8    q11, d5, d3                   \n"  // dg * a
      "vmull.u8    q12, d6, d3                   \n"  // dr * a
      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
      "vqsub.u8    q2, q2, q10                   \n"  // dbg - dbg * a / 256
      "vqsub.u8    d6, d6, d22                   \n"  // dr - dr * a / 256
      "vqadd.u8    q0, q0, q2                    \n"  // + sbg
      "vqadd.u8    d2, d2, d6                    \n"  // + sr
      "vmov.u8     d3, #255                      \n"  // a = 255
      "vst4.8      {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
      "bge         1b                            \n"

      "99:                                       \n"

      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
}
3095
// Attenuate 8 pixels at a time: premultiply B/G/R by alpha,
// c = (c * a + 255) >> 8, leaving alpha unchanged.
void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
                           uint8_t* dst_argb,
                           int width) {
  asm volatile(
      "vmov.u16    q15, #0x00ff                  \n"  // 255 for rounding up

      // Attenuate 8 pixels.
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 pixels of ARGB.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q10, d0, d3                   \n"  // b * a
      "vmull.u8    q11, d1, d3                   \n"  // g * a
      "vmull.u8    q12, d2, d3                   \n"  // r * a
      "vaddhn.u16  d0, q10, q15                  \n"  // (b + 255) >> 8
      "vaddhn.u16  d1, q11, q15                  \n"  // (g + 255) >> 8
      "vaddhn.u16  d2, q12, q15                  \n"  // (r + 255) >> 8
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q10", "q11", "q12", "q15");
}
3121
// Quantize 8 ARGB pixels (32 bytes), in place.
// dst = (dst * scale >> 16) * interval_size + interval_offset;
// Note the load has no writeback ([%0]) while the store does ([%0]!), so
// each iteration reads and rewrites the same 8 pixels. Alpha (d6) is
// loaded and stored untouched.
void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile(
      "vdup.u16    q8, %2                        \n"
      "vshr.u16    q8, q8, #1                    \n"  // scale >>= 1 (vqdmulh
                                                      // doubles it back)
      "vdup.u16    q9, %3                        \n"  // interval multiply.
      "vdup.u16    q10, %4                       \n"  // interval add

      // 8 pixel loop.
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]        \n"  // load 8 pixels of ARGB.
      "subs        %1, %1, #8                    \n"  // 8 processed per loop.
      "vmovl.u8    q0, d0                        \n"  // b (0 .. 255)
      "vmovl.u8    q1, d2                        \n"
      "vmovl.u8    q2, d4                        \n"
      "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
      "vqdmulh.s16 q1, q1, q8                    \n"  // g
      "vqdmulh.s16 q2, q2, q8                    \n"  // r
      "vmul.u16    q0, q0, q9                    \n"  // b * interval_size
      "vmul.u16    q1, q1, q9                    \n"  // g
      "vmul.u16    q2, q2, q9                    \n"  // r
      "vadd.u16    q0, q0, q10                   \n"  // b + interval_offset
      "vadd.u16    q1, q1, q10                   \n"  // g
      "vadd.u16    q2, q2, q10                   \n"  // r
      "vqmovn.u16  d0, q0                        \n"
      "vqmovn.u16  d2, q1                        \n"
      "vqmovn.u16  d4, q2                        \n"
      "vst4.8      {d0, d2, d4, d6}, [%0]!       \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(dst_argb),       // %0
        "+r"(width)           // %1
      : "r"(scale),           // %2
        "r"(interval_size),   // %3
        "r"(interval_offset)  // %4
      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
}
3163
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar from a low d-register
// (16-bit scalar operands are restricted to d0-d7 on ARM32 NEON).
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
void ARGBShadeRow_NEON(const uint8_t* src_argb,
                       uint8_t* dst_argb,
                       int width,
                       uint32_t value) {
  asm volatile(
      "vdup.u32    q0, %3                        \n"  // duplicate scale value.
      "vzip.u8     d0, d1                        \n"  // d0 aarrggbb.
      "vshr.u16    q0, q0, #1                    \n"  // scale / 2.

      // 8 pixel loop.
      "1:                                        \n"
      "vld4.8      {d20, d22, d24, d26}, [%0]!   \n"  // load 8 pixels of ARGB.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vmovl.u8    q10, d20                      \n"  // b (0 .. 255)
      "vmovl.u8    q11, d22                      \n"
      "vmovl.u8    q12, d24                      \n"
      "vmovl.u8    q13, d26                      \n"
      "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
      "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
      "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
      "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
      "vqmovn.u16  d20, q10                      \n"
      "vqmovn.u16  d22, q11                      \n"
      "vqmovn.u16  d24, q12                      \n"
      "vqmovn.u16  d26, q13                      \n"
      "vst4.8      {d20, d22, d24, d26}, [%1]!   \n"  // store 8 pixels of ARGB.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      : "r"(value)       // %3
      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
}
3200
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB (gray replicated to B, G and R;
// alpha passed through).
// C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8     d24, #29                      \n"  // B * 0.1140 coefficient
      "vmov.u8     d25, #150                     \n"  // G * 0.5870 coefficient
      "vmov.u8     d26, #77                      \n"  // R * 0.2990 coefficient
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q2, d0, d24                   \n"  // B
      "vmlal.u8    q2, d1, d25                   \n"  // G
      "vmlal.u8    q2, d2, d26                   \n"  // R
      "vqrshrn.u16 d0, q2, #8                    \n"  // 16 bit to 8 bit B
      "vmov        d1, d0                        \n"  // G
      "vmov        d2, d0                        \n"  // R
      "vst4.8      {d0, d1, d2, d3}, [%1]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(width)      // %2
      :
      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
}
3226
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
  asm volatile(
      "vmov.u8     d20, #17                      \n"  // BB coefficient
      "vmov.u8     d21, #68                      \n"  // BG coefficient
      "vmov.u8     d22, #35                      \n"  // BR coefficient
      "vmov.u8     d24, #22                      \n"  // GB coefficient
      "vmov.u8     d25, #88                      \n"  // GG coefficient
      "vmov.u8     d26, #45                      \n"  // GR coefficient
      "vmov.u8     d28, #24                      \n"  // RB coefficient
      "vmov.u8     d29, #98                      \n"  // RG coefficient
      "vmov.u8     d30, #50                      \n"  // RR coefficient
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]        \n"  // load 8 ARGB pixels.
      "subs        %1, %1, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q2, d0, d20                   \n"  // B to Sepia B
      "vmlal.u8    q2, d1, d21                   \n"  // G
      "vmlal.u8    q2, d2, d22                   \n"  // R
      "vmull.u8    q3, d0, d24                   \n"  // B to Sepia G
      "vmlal.u8    q3, d1, d25                   \n"  // G
      "vmlal.u8    q3, d2, d26                   \n"  // R
      "vmull.u8    q8, d0, d28                   \n"  // B to Sepia R
      "vmlal.u8    q8, d1, d29                   \n"  // G
      "vmlal.u8    q8, d2, d30                   \n"  // R
      "vqshrn.u16  d0, q2, #7                    \n"  // 16 bit to 8 bit B
      "vqshrn.u16  d1, q3, #7                    \n"  // 16 bit to 8 bit G
      "vqshrn.u16  d2, q8, #7                    \n"  // 16 bit to 8 bit R
      "vst4.8      {d0, d1, d2, d3}, [%0]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(dst_argb),  // %0
        "+r"(width)      // %1
      :
      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
        "q14", "q15");
}
3265
// Transform 8 ARGB pixels (32 bytes) with a 4x4 signed color matrix
// (matrix_argb is 16 int8 values; products are accumulated with saturation
// and scaled down by >> 6).
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
                             uint8_t* dst_argb,
                             const int8_t* matrix_argb,
                             int width) {
  asm volatile(
      "vld1.8      {q2}, [%3]                    \n"  // load 3 ARGB vectors.
      "vmovl.s8    q0, d4                        \n"  // B,G coefficients s16.
      "vmovl.s8    q1, d5                        \n"  // R,A coefficients s16.

      "1:                                        \n"
      "vld4.8      {d16, d18, d20, d22}, [%0]!   \n"  // load 8 ARGB pixels.
      "subs        %2, %2, #8                    \n"  // 8 processed per loop.
      "vmovl.u8    q8, d16                       \n"  // b (0 .. 255) 16 bit
      "vmovl.u8    q9, d18                       \n"  // g
      "vmovl.u8    q10, d20                      \n"  // r
      "vmovl.u8    q11, d22                      \n"  // a
      "vmul.s16    q12, q8, d0[0]                \n"  // B = B * Matrix B
      "vmul.s16    q13, q8, d1[0]                \n"  // G = B * Matrix G
      "vmul.s16    q14, q8, d2[0]                \n"  // R = B * Matrix R
      "vmul.s16    q15, q8, d3[0]                \n"  // A = B * Matrix A
      "vmul.s16    q4, q9, d0[1]                 \n"  // B += G * Matrix B
      "vmul.s16    q5, q9, d1[1]                 \n"  // G += G * Matrix G
      "vmul.s16    q6, q9, d2[1]                 \n"  // R += G * Matrix R
      "vmul.s16    q7, q9, d3[1]                 \n"  // A += G * Matrix A
      "vqadd.s16   q12, q12, q4                  \n"  // Accumulate B
      "vqadd.s16   q13, q13, q5                  \n"  // Accumulate G
      "vqadd.s16   q14, q14, q6                  \n"  // Accumulate R
      "vqadd.s16   q15, q15, q7                  \n"  // Accumulate A
      "vmul.s16    q4, q10, d0[2]                \n"  // B += R * Matrix B
      "vmul.s16    q5, q10, d1[2]                \n"  // G += R * Matrix G
      "vmul.s16    q6, q10, d2[2]                \n"  // R += R * Matrix R
      "vmul.s16    q7, q10, d3[2]                \n"  // A += R * Matrix A
      "vqadd.s16   q12, q12, q4                  \n"  // Accumulate B
      "vqadd.s16   q13, q13, q5                  \n"  // Accumulate G
      "vqadd.s16   q14, q14, q6                  \n"  // Accumulate R
      "vqadd.s16   q15, q15, q7                  \n"  // Accumulate A
      "vmul.s16    q4, q11, d0[3]                \n"  // B += A * Matrix B
      "vmul.s16    q5, q11, d1[3]                \n"  // G += A * Matrix G
      "vmul.s16    q6, q11, d2[3]                \n"  // R += A * Matrix R
      "vmul.s16    q7, q11, d3[3]                \n"  // A += A * Matrix A
      "vqadd.s16   q12, q12, q4                  \n"  // Accumulate B
      "vqadd.s16   q13, q13, q5                  \n"  // Accumulate G
      "vqadd.s16   q14, q14, q6                  \n"  // Accumulate R
      "vqadd.s16   q15, q15, q7                  \n"  // Accumulate A
      "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
      "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
      "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
      "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
      "vst4.8      {d16, d18, d20, d22}, [%1]!   \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_argb),    // %0
        "+r"(dst_argb),    // %1
        "+r"(width)        // %2
      : "r"(matrix_argb)   // %3
      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
        "q10", "q11", "q12", "q13", "q14", "q15");
}
3326
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Each channel: dst = (a * b + 128) >> 8 (vrshrn rounds).
void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1:                                        \n"
      "vld4.8      {d0, d2, d4, d6}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.8      {d1, d3, d5, d7}, [%1]!       \n"  // load 8 more ARGB
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vmull.u8    q0, d0, d1                    \n"  // multiply B
      "vmull.u8    q1, d2, d3                    \n"  // multiply G
      "vmull.u8    q2, d4, d5                    \n"  // multiply R
      "vmull.u8    q3, d6, d7                    \n"  // multiply A
      "vrshrn.u16  d0, q0, #8                    \n"  // 16 bit to 8 bit B
      "vrshrn.u16  d1, q1, #8                    \n"  // 16 bit to 8 bit G
      "vrshrn.u16  d2, q2, #8                    \n"  // 16 bit to 8 bit R
      "vrshrn.u16  d3, q3, #8                    \n"  // 16 bit to 8 bit A
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
3355
// Add 2 rows of ARGB pixels together, 8 pixels at a time, with unsigned
// saturation (vqadd clamps to 255).
void ARGBAddRow_NEON(const uint8_t* src_argb,
                     const uint8_t* src_argb1,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      // 8 pixel loop.
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load 8 more ARGB
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vqadd.u8    q0, q0, q2                    \n"  // add B, G
      "vqadd.u8    q1, q1, q3                    \n"  // add R, A
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
3378
// Subtract 2 rows of ARGB pixels, 8 pixels at a time, with unsigned
// saturation (vqsub clamps at 0).
void ARGBSubtractRow_NEON(const uint8_t* src_argb,
                          const uint8_t* src_argb1,
                          uint8_t* dst_argb,
                          int width) {
  asm volatile(
      // 8 pixel loop.
      "1:                                        \n"
      "vld4.8      {d0, d1, d2, d3}, [%0]!       \n"  // load 8 ARGB pixels.
      "vld4.8      {d4, d5, d6, d7}, [%1]!       \n"  // load 8 more ARGB
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vqsub.u8    q0, q0, q2                    \n"  // subtract B, G
      "vqsub.u8    q1, q1, q3                    \n"  // subtract R, A
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_argb),   // %0
        "+r"(src_argb1),  // %1
        "+r"(dst_argb),   // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "q0", "q1", "q2", "q3");
}
3401
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
void SobelRow_NEON(const uint8_t* src_sobelx,
                   const uint8_t* src_sobely,
                   uint8_t* dst_argb,
                   int width) {
  asm volatile(
      "vmov.u8     d3, #255                      \n"  // alpha
      // 8 pixel loop.
      "1:                                        \n"
      "vld1.8      {d0}, [%0]!                   \n"  // load 8 sobelx.
      "vld1.8      {d1}, [%1]!                   \n"  // load 8 sobely.
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vqadd.u8    d0, d0, d1                    \n"  // add (saturating)
      "vmov.u8     d1, d0                        \n"  // replicate to G
      "vmov.u8     d2, d0                        \n"  // replicate to R
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
3430
// Adds Sobel X and Sobel Y and stores Sobel into plane (single-channel,
// saturating add, 16 pixels per iteration).
void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
                          const uint8_t* src_sobely,
                          uint8_t* dst_y,
                          int width) {
  asm volatile(
      // 16 pixel loop.
      "1:                                        \n"
      "vld1.8      {q0}, [%0]!                   \n"  // load 16 sobelx.
      "vld1.8      {q1}, [%1]!                   \n"  // load 16 sobely.
      "subs        %3, %3, #16                   \n"  // 16 processed per loop.
      "vqadd.u8    q0, q0, q1                    \n"  // add (saturating)
      "vst1.8      {q0}, [%2]!                   \n"  // store 16 pixels.
      "bgt         1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_y),       // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
3452
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
void SobelXYRow_NEON(const uint8_t* src_sobelx,
                     const uint8_t* src_sobely,
                     uint8_t* dst_argb,
                     int width) {
  asm volatile(
      "vmov.u8     d3, #255                      \n"  // alpha
      // 8 pixel loop.
      "1:                                        \n"
      "vld1.8      {d2}, [%0]!                   \n"  // load 8 sobelx (-> R).
      "vld1.8      {d0}, [%1]!                   \n"  // load 8 sobely (-> B).
      "subs        %3, %3, #8                    \n"  // 8 processed per loop.
      "vqadd.u8    d1, d0, d2                    \n"  // add (saturating) -> G
      "vst4.8      {d0, d1, d2, d3}, [%2]!       \n"  // store 8 ARGB pixels.
      "bgt         1b                            \n"
      : "+r"(src_sobelx),  // %0
        "+r"(src_sobely),  // %1
        "+r"(dst_argb),    // %2
        "+r"(width)        // %3
      :
      : "cc", "memory", "q0", "q1");
}
3479
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
// For each of three rows, loads the left tap, advances by %5 (=2) to the
// right tap, subtracts, and advances by %6 (=6) so the net step is 8 pixels.
// Result is |sum| saturated to 8 bits.
void SobelXRow_NEON(const uint8_t* src_y0,
                    const uint8_t* src_y1,
                    const uint8_t* src_y2,
                    uint8_t* dst_sobelx,
                    int width) {
  asm volatile(
      "1:                                        \n"
      "vld1.8      {d0}, [%0],%5                 \n"  // top
      "vld1.8      {d1}, [%0],%6                 \n"
      "vsubl.u8    q0, d0, d1                    \n"
      "vld1.8      {d2}, [%1],%5                 \n"  // center * 2
      "vld1.8      {d3}, [%1],%6                 \n"
      "vsubl.u8    q1, d2, d3                    \n"
      "vadd.s16    q0, q0, q1                    \n"  // added twice = * 2
      "vadd.s16    q0, q0, q1                    \n"
      "vld1.8      {d2}, [%2],%5                 \n"  // bottom
      "vld1.8      {d3}, [%2],%6                 \n"
      "subs        %4, %4, #8                    \n"  // 8 pixels
      "vsubl.u8    q1, d2, d3                    \n"
      "vadd.s16    q0, q0, q1                    \n"
      "vabs.s16    q0, q0                        \n"
      "vqmovn.u16  d0, q0                        \n"
      "vst1.8      {d0}, [%3]!                   \n"  // store 8 sobelx
      "bgt         1b                            \n"
      : "+r"(src_y0),      // %0
        "+r"(src_y1),      // %1
        "+r"(src_y2),      // %2
        "+r"(dst_sobelx),  // %3
        "+r"(width)        // %4
      : "r"(2),            // %5
        "r"(6)             // %6
      : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
3518
3519 // SobelY as a matrix is
3520 // -1 -2 -1
3521 // 0 0 0
3522 // 1 2 1
// Computes |SobelY| for 8 output pixels per iteration from two input
// rows (top/bottom).  %4 (=1) and %5 (=6) are post-increment amounts:
// the three loads per row read columns 0, +1 and +2, advancing
// 1 + 1 + 6 = 8 bytes per row per iteration.
// The center column's weight of 2 is applied by adding q1 twice.
SobelYRow_NEON(const uint8_t * src_y0,const uint8_t * src_y1,uint8_t * dst_sobely,int width)3523 void SobelYRow_NEON(const uint8_t* src_y0,
3524                     const uint8_t* src_y1,
3525                     uint8_t* dst_sobely,
3526                     int width) {
3527   asm volatile(
3528       "1: \n"
3529       "vld1.8 {d0}, [%0],%4 \n"  // left
3530       "vld1.8 {d1}, [%1],%4 \n"
3531       "vsubl.u8 q0, d0, d1 \n"  // top - bottom, widened to s16
3532       "vld1.8 {d2}, [%0],%4 \n"  // center * 2
3533       "vld1.8 {d3}, [%1],%4 \n"
3534       "vsubl.u8 q1, d2, d3 \n"
3535       "vadd.s16 q0, q0, q1 \n"  // add center difference twice
3536       "vadd.s16 q0, q0, q1 \n"
3537       "vld1.8 {d2}, [%0],%5 \n"  // right
3538       "vld1.8 {d3}, [%1],%5 \n"
3539       "subs %3, %3, #8 \n"  // 8 pixels
3540       "vsubl.u8 q1, d2, d3 \n"
3541       "vadd.s16 q0, q0, q1 \n"
3542       "vabs.s16 q0, q0 \n"  // magnitude only
3543       "vqmovn.u16 d0, q0 \n"  // saturate to u8
3544       "vst1.8 {d0}, [%2]! \n"  // store 8 sobely
3545       "bgt 1b \n"
3546       : "+r"(src_y0),      // %0
3547         "+r"(src_y1),      // %1
3548         "+r"(dst_sobely),  // %2
3549         "+r"(width)        // %3
3550       : "r"(1),            // %4
3551         "r"(6)             // %5
3552       : "cc", "memory", "q0", "q1"  // Clobber List
3553   );
3554 }
3555
3556 // %y passes a float as a scalar vector for vector * scalar multiply.
3557 // The register must be d0 to d15 and indexed with [0] or [1] to access
3558 // the first or second float of the d-reg.
3559
// Converts 8 uint16 samples per iteration to IEEE half floats with an
// implicit scale of 1.0 (the float parameter is ignored).
// 1.9259299444e-34f is 2^-112: multiplying by it rebias-shifts the
// single-precision exponent so that "vqshrn #13" isolates the
// half-float bit pattern (sign/exponent/10-bit mantissa).
HalfFloat1Row_NEON(const uint16_t * src,uint16_t * dst,float,int width)3560 void HalfFloat1Row_NEON(const uint16_t* src,
3561                         uint16_t* dst,
3562                         float /*unused*/,
3563                         int width) {
3564   asm volatile(
3565
3566       "1: \n"
3567       "vld1.8 {q1}, [%0]! \n"  // load 8 shorts
3568       "subs %2, %2, #8 \n"  // 8 pixels per loop
3569       "vmovl.u16 q2, d2 \n"  // 8 int's
3570       "vmovl.u16 q3, d3 \n"
3571       "vcvt.f32.u32 q2, q2 \n"  // 8 floats
3572       "vcvt.f32.u32 q3, q3 \n"
3573       "vmul.f32 q2, q2, %y3 \n"  // adjust exponent
3574       "vmul.f32 q3, q3, %y3 \n"
3575       "vqshrn.u32 d2, q2, #13 \n"  // isolate halffloat
3576       "vqshrn.u32 d3, q3, #13 \n"
3577       "vst1.8 {q1}, [%1]! \n"
3578       "bgt 1b \n"
3579       : "+r"(src),               // %0
3580         "+r"(dst),               // %1
3581         "+r"(width)              // %2
3582       : "w"(1.9259299444e-34f)   // %3
3583       : "cc", "memory", "q1", "q2", "q3");
3584 }
3585
// Converts 8 uint16 samples per iteration to IEEE half floats, scaled
// by 'scale'.  The scale is folded into the 2^-112 exponent-rebias
// constant (see HalfFloat1Row_NEON) so the loop body is identical:
// multiply, then "vqshrn #13" to isolate the half-float bits.
HalfFloatRow_NEON(const uint16_t * src,uint16_t * dst,float scale,int width)3586 void HalfFloatRow_NEON(const uint16_t* src,
3587                        uint16_t* dst,
3588                        float scale,
3589                        int width) {
3590   asm volatile(
3591
3592       "1: \n"
3593       "vld1.8 {q1}, [%0]! \n"  // load 8 shorts
3594       "subs %2, %2, #8 \n"  // 8 pixels per loop
3595       "vmovl.u16 q2, d2 \n"  // 8 int's
3596       "vmovl.u16 q3, d3 \n"
3597       "vcvt.f32.u32 q2, q2 \n"  // 8 floats
3598       "vcvt.f32.u32 q3, q3 \n"
3599       "vmul.f32 q2, q2, %y3 \n"  // adjust exponent
3600       "vmul.f32 q3, q3, %y3 \n"
3601       "vqshrn.u32 d2, q2, #13 \n"  // isolate halffloat
3602       "vqshrn.u32 d3, q3, #13 \n"
3603       "vst1.8 {q1}, [%1]! \n"
3604       "bgt 1b \n"
3605       : "+r"(src),                       // %0
3606         "+r"(dst),                       // %1
3607         "+r"(width)                      // %2
3608       : "w"(scale * 1.9259299444e-34f)   // %3
3609       : "cc", "memory", "q1", "q2", "q3");
3610 }
3611
// Converts 8 bytes per iteration to 8 scaled floats:
// dst[i] = (float)src[i] * scale.  Widens u8 -> u16 -> u32, converts to
// f32, multiplies by the scalar, and stores 32 bytes of floats.
ByteToFloatRow_NEON(const uint8_t * src,float * dst,float scale,int width)3612 void ByteToFloatRow_NEON(const uint8_t* src,
3613                          float* dst,
3614                          float scale,
3615                          int width) {
3616   asm volatile(
3617
3618       "1: \n"
3619       "vld1.8 {d2}, [%0]! \n"  // load 8 bytes
3620       "subs %2, %2, #8 \n"  // 8 pixels per loop
3621       "vmovl.u8 q1, d2 \n"  // 8 shorts
3622       "vmovl.u16 q2, d2 \n"  // 8 ints (d2 is low half of q1)
3623       "vmovl.u16 q3, d3 \n"
3624       "vcvt.f32.u32 q2, q2 \n"  // 8 floats
3625       "vcvt.f32.u32 q3, q3 \n"
3626       "vmul.f32 q2, q2, %y3 \n"  // scale
3627       "vmul.f32 q3, q3, %y3 \n"
3628       "vst1.8 {q2, q3}, [%1]! \n"  // store 8 floats
3629       "bgt 1b \n"
3630       : "+r"(src),    // %0
3631         "+r"(dst),    // %1
3632         "+r"(width)   // %2
3633       : "w"(scale)    // %3
3634       : "cc", "memory", "q1", "q2", "q3");
3635 }
3636
3637 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// Vertical Gaussian pass: dst[i] = src0[i] + 4*src1[i] + 6*src2[i]
//                                  + 4*src3[i] + src4[i], widened to u32.
// Processes 8 samples per iteration; output is left unnormalized
// (divided by 256 later in GaussRow_NEON's vqshrn #8).
GaussCol_NEON(const uint16_t * src0,const uint16_t * src1,const uint16_t * src2,const uint16_t * src3,const uint16_t * src4,uint32_t * dst,int width)3638 void GaussCol_NEON(const uint16_t* src0,
3639                    const uint16_t* src1,
3640                    const uint16_t* src2,
3641                    const uint16_t* src3,
3642                    const uint16_t* src4,
3643                    uint32_t* dst,
3644                    int width) {
3645   asm volatile(
3646       "vmov.u16 d6, #4 \n"  // constant 4
3647       "vmov.u16 d7, #6 \n"  // constant 6
3648
3649       "1: \n"
3650       "vld1.16 {q1}, [%0]! \n"  // load 8 samples, 5 rows
3651       "vld1.16 {q2}, [%4]! \n"
3652       "vaddl.u16 q0, d2, d4 \n"  // * 1 (rows 0 and 4)
3653       "vaddl.u16 q1, d3, d5 \n"  // * 1
3654       "vld1.16 {q2}, [%1]! \n"
3655       "vmlal.u16 q0, d4, d6 \n"  // * 4 (row 1)
3656       "vmlal.u16 q1, d5, d6 \n"  // * 4
3657       "vld1.16 {q2}, [%2]! \n"
3658       "vmlal.u16 q0, d4, d7 \n"  // * 6 (row 2)
3659       "vmlal.u16 q1, d5, d7 \n"  // * 6
3660       "vld1.16 {q2}, [%3]! \n"
3661       "vmlal.u16 q0, d4, d6 \n"  // * 4 (row 3)
3662       "vmlal.u16 q1, d5, d6 \n"  // * 4
3663       "subs %6, %6, #8 \n"  // 8 processed per loop
3664       "vst1.32 {q0, q1}, [%5]! \n"  // store 8 samples
3665       "bgt 1b \n"
3666       : "+r"(src0),  // %0
3667         "+r"(src1),  // %1
3668         "+r"(src2),  // %2
3669         "+r"(src3),  // %3
3670         "+r"(src4),  // %4
3671         "+r"(dst),   // %5
3672         "+r"(width)  // %6
3673       :
3674       : "cc", "memory", "q0", "q1", "q2", "q3");
3675 }
3676
3677 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
// Horizontal Gaussian pass over GaussCol_NEON output:
// dst[i] = (src[i] + 4*src[i+1] + 6*src[i+2] + 4*src[i+3] + src[i+4]
//           + rounding) >> 8, saturated to u16.
// src1..src3 alias src at offsets +1..+3; the *1 taps (src[i] and
// src[i+4]) come from the 12 contiguous samples loaded via %0.
// Reads 4 samples beyond width; caller must provide that overread.
GaussRow_NEON(const uint32_t * src,uint16_t * dst,int width)3678 void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
3679   const uint32_t* src1 = src + 1;
3680   const uint32_t* src2 = src + 2;
3681   const uint32_t* src3 = src + 3;
3682   asm volatile(
3683       "vmov.u32 q10, #4 \n"  // constant 4
3684       "vmov.u32 q11, #6 \n"  // constant 6
3685
3686       "1: \n"
3687       "vld1.32 {q0, q1}, [%0]! \n"  // load 12 source samples
3688       "vld1.32 {q2}, [%0] \n"
3689       "vadd.u32 q0, q0, q1 \n"  // * 1 (src[i] + src[i+4])
3690       "vadd.u32 q1, q1, q2 \n"  // * 1
3691       "vld1.32 {q2, q3}, [%2]! \n"
3692       "vmla.u32 q0, q2, q11 \n"  // * 6 (src[i+2])
3693       "vmla.u32 q1, q3, q11 \n"  // * 6
3694       "vld1.32 {q2, q3}, [%1]! \n"
3695       "vld1.32 {q8, q9}, [%3]! \n"
3696       "vadd.u32 q2, q2, q8 \n"  // add rows for * 4
3697       "vadd.u32 q3, q3, q9 \n"
3698       "vmla.u32 q0, q2, q10 \n"  // * 4
3699       "vmla.u32 q1, q3, q10 \n"  // * 4
3700       "subs %5, %5, #8 \n"  // 8 processed per loop
3701       "vqshrn.u32 d0, q0, #8 \n"  // round and pack
3702       "vqshrn.u32 d1, q1, #8 \n"
3703       "vst1.u16 {q0}, [%4]! \n"  // store 8 samples
3704       "bgt 1b \n"
3705       : "+r"(src),   // %0
3706         "+r"(src1),  // %1
3707         "+r"(src2),  // %2
3708         "+r"(src3),  // %3
3709         "+r"(dst),   // %4
3710         "+r"(width)  // %5
3711       :
3712       : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
3713 }
3714
3715 // Convert biplanar NV21 to packed YUV24
// Converts 16 pixels of biplanar NV21 (Y plane + half-width interleaved
// VU) to packed 3-byte-per-pixel YUV24.  vld2 deinterleaves VU into
// d0=V, d2=U; each chroma sample is duplicated via vzip so two adjacent
// Y values share it.  vst3 interleaves (V, U, Y) triplets per pixel.
NV21ToYUV24Row_NEON(const uint8_t * src_y,const uint8_t * src_vu,uint8_t * dst_yuv24,int width)3716 void NV21ToYUV24Row_NEON(const uint8_t* src_y,
3717                          const uint8_t* src_vu,
3718                          uint8_t* dst_yuv24,
3719                          int width) {
3720   asm volatile(
3721       "1: \n"
3722       "vld1.8 {q2}, [%0]! \n"  // load 16 Y values
3723       "vld2.8 {d0, d2}, [%1]! \n"  // load 8 VU values
3724       "vmov d1, d0 \n"
3725       "vzip.u8 d0, d1 \n"  // VV
3726       "vmov d3, d2 \n"
3727       "vzip.u8 d2, d3 \n"  // UU
3728       "subs %3, %3, #16 \n"  // 16 pixels per loop
3729       "vst3.8 {d0, d2, d4}, [%2]! \n"  // store 16 YUV pixels
3730       "vst3.8 {d1, d3, d5}, [%2]! \n"
3731       "bgt 1b \n"
3732       : "+r"(src_y),      // %0
3733         "+r"(src_vu),     // %1
3734         "+r"(dst_yuv24),  // %2
3735         "+r"(width)       // %3
3736       :
3737       : "cc", "memory", "q0", "q1", "q2");
3738 }
3739
// 2x2 subsamples the U and V channels of two rows of AYUV into an
// interleaved UV row.  vld4 deinterleaves each pixel's bytes:
// d0/d1 = first byte (V), d2/d3 = second byte (U).  Horizontal pairs
// are summed with vpaddl, the second row is accumulated with vpadal,
// and vqrshrun #2 produces the rounded 4-sample average.
AYUVToUVRow_NEON(const uint8_t * src_ayuv,int src_stride_ayuv,uint8_t * dst_uv,int width)3740 void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
3741                       int src_stride_ayuv,
3742                       uint8_t* dst_uv,
3743                       int width) {
3744   asm volatile(
3745       "add %1, %0, %1 \n"  // src_stride + src_AYUV
3746       "1: \n"
3747       "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 AYUV pixels.
3748       "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 AYUV
3749                                            // pixels.
3750       "vpaddl.u8 q0, q0 \n"  // V 16 bytes -> 8 shorts.
3751       "vpaddl.u8 q1, q1 \n"  // U 16 bytes -> 8 shorts.
3752       "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more AYUV
3753                                               // pixels.
3754       "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 AYUV
3755                                               // pixels.
3756       "vpadal.u8 q0, q4 \n"  // V 16 bytes -> 8 shorts.
3757       "vpadal.u8 q1, q5 \n"  // U 16 bytes -> 8 shorts.
3758       "vqrshrun.s16 d1, q0, #2 \n"  // 2x2 average
3759       "vqrshrun.s16 d0, q1, #2 \n"
3760       "subs %3, %3, #16 \n"  // 16 processed per loop.
3761       "vst2.8 {d0, d1}, [%2]! \n"  // store 8 pixels UV.
3762       "bgt 1b \n"
3763       : "+r"(src_ayuv),         // %0
3764         "+r"(src_stride_ayuv),  // %1
3765         "+r"(dst_uv),           // %2
3766         "+r"(width)             // %3
3767       :
3768       : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
3769 }
3770
// 2x2 subsamples the U and V channels of two rows of AYUV into an
// interleaved VU row.  Identical to AYUVToUVRow_NEON except the
// pack step leaves V in d0 and U in d1, so vst2 emits VU order.
AYUVToVURow_NEON(const uint8_t * src_ayuv,int src_stride_ayuv,uint8_t * dst_vu,int width)3771 void AYUVToVURow_NEON(const uint8_t* src_ayuv,
3772                       int src_stride_ayuv,
3773                       uint8_t* dst_vu,
3774                       int width) {
3775   asm volatile(
3776       "add %1, %0, %1 \n"  // src_stride + src_AYUV
3777       "1: \n"
3778       "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 AYUV pixels.
3779       "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 AYUV
3780                                            // pixels.
3781       "vpaddl.u8 q0, q0 \n"  // V 16 bytes -> 8 shorts.
3782       "vpaddl.u8 q1, q1 \n"  // U 16 bytes -> 8 shorts.
3783       "vld4.8 {d8, d10, d12, d14}, [%1]! \n"  // load 8 more AYUV
3784                                               // pixels.
3785       "vld4.8 {d9, d11, d13, d15}, [%1]! \n"  // load last 8 AYUV
3786                                               // pixels.
3787       "vpadal.u8 q0, q4 \n"  // V 16 bytes -> 8 shorts.
3788       "vpadal.u8 q1, q5 \n"  // U 16 bytes -> 8 shorts.
3789       "vqrshrun.s16 d0, q0, #2 \n"  // 2x2 average
3790       "vqrshrun.s16 d1, q1, #2 \n"
3791       "subs %3, %3, #16 \n"  // 16 processed per loop.
3792       "vst2.8 {d0, d1}, [%2]! \n"  // store 8 pixels VU.
3793       "bgt 1b \n"
3794       : "+r"(src_ayuv),         // %0
3795         "+r"(src_stride_ayuv),  // %1
3796         "+r"(dst_vu),           // %2
3797         "+r"(width)             // %3
3798       :
3799       : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
3800 }
3801
3802 // Copy row of AYUV Y's into Y.
3803 // Similar to ARGBExtractAlphaRow_NEON
// Extracts the Y channel (third byte of each AYUV pixel, landing in
// q2 after vld4 deinterleave) for 16 pixels per iteration.
AYUVToYRow_NEON(const uint8_t * src_ayuv,uint8_t * dst_y,int width)3804 void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
3805   asm volatile(
3806       "1: \n"
3807       "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 AYUV pixels
3808       "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 AYUV pixels
3809       "subs %2, %2, #16 \n"  // 16 processed per loop
3810       "vst1.8 {q2}, [%1]! \n"  // store 16 Y's.
3811       "bgt 1b \n"
3812       : "+r"(src_ayuv),  // %0
3813         "+r"(dst_y),     // %1
3814         "+r"(width)      // %2
3815       :
3816       : "cc", "memory", "q0", "q1", "q2", "q3");
3817 }
3818
3819 // Convert UV plane of NV12 to VU of NV21.
// Swaps interleaved chroma byte order (UV -> VU), 16 pairs per
// iteration.  vld2 splits U into q0 and V into q1; U is copied to q2
// so vst2 {q1, q2} re-interleaves as V,U.
SwapUVRow_NEON(const uint8_t * src_uv,uint8_t * dst_vu,int width)3820 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
3821   asm volatile(
3822       "1: \n"
3823       "vld2.8 {d0, d2}, [%0]! \n"  // load 16 UV values
3824       "vld2.8 {d1, d3}, [%0]! \n"
3825       "vorr.u8 q2, q0, q0 \n"  // move U after V
3826       "subs %2, %2, #16 \n"  // 16 pixels per loop
3827       "vst2.8 {q1, q2}, [%1]! \n"  // store 16 VU pixels
3828       "bgt 1b \n"
3829       : "+r"(src_uv),  // %0
3830         "+r"(dst_vu),  // %1
3831         "+r"(width)    // %2
3832       :
3833       : "cc", "memory", "q0", "q1", "q2");
3834 }
3835
// 2x2 box-averages separate full-resolution U and V planes (two rows
// each) and merges them into one half-resolution interleaved UV row.
// vpaddl sums horizontal pairs, vpadal adds the second row, and
// vqrshrn #2 produces the rounded average of the 4 samples.
HalfMergeUVRow_NEON(const uint8_t * src_u,int src_stride_u,const uint8_t * src_v,int src_stride_v,uint8_t * dst_uv,int width)3836 void HalfMergeUVRow_NEON(const uint8_t* src_u,
3837                          int src_stride_u,
3838                          const uint8_t* src_v,
3839                          int src_stride_v,
3840                          uint8_t* dst_uv,
3841                          int width) {
3842   const uint8_t* src_u_1 = src_u + src_stride_u;
3843   const uint8_t* src_v_1 = src_v + src_stride_v;
3844   asm volatile(
3845       "1: \n"
3846       "vld1.8 {q0}, [%0]! \n"  // load 16 U values
3847       "vld1.8 {q1}, [%2]! \n"  // load 16 V values
3848       "vld1.8 {q2}, [%1]! \n"
3849       "vld1.8 {q3}, [%3]! \n"
3850       "vpaddl.u8 q0, q0 \n"  // half size
3851       "vpaddl.u8 q1, q1 \n"
3852       "vpadal.u8 q0, q2 \n"
3853       "vpadal.u8 q1, q3 \n"
3854       "vqrshrn.u16 d0, q0, #2 \n"
3855       "vqrshrn.u16 d1, q1, #2 \n"
3856       "subs %5, %5, #16 \n"  // 16 src pixels per loop
3857       "vst2.8 {d0, d1}, [%4]! \n"  // store 8 UV pixels
3858       "bgt 1b \n"
3859       : "+r"(src_u),    // %0
3860         "+r"(src_u_1),  // %1
3861         "+r"(src_v),    // %2
3862         "+r"(src_v_1),  // %3
3863         "+r"(dst_uv),   // %4
3864         "+r"(width)     // %5
3865       :
3866       : "cc", "memory", "q0", "q1", "q2", "q3");
3867 }
3868
// Splits interleaved 16-bit UV into separate U and V planes while
// scaling lsb-justified samples of 'depth' bits up to 16 bits.
// shift = depth - 16 is negative, and vshl.u16 with a negative vector
// shift count performs a right shift — here used as a left shift of
// (16 - depth) via the negated value.  NOTE(review): depth > 16 would
// shift left; callers presumably pass depth <= 16 — confirm.
SplitUVRow_16_NEON(const uint16_t * src_uv,uint16_t * dst_u,uint16_t * dst_v,int depth,int width)3869 void SplitUVRow_16_NEON(const uint16_t* src_uv,
3870                         uint16_t* dst_u,
3871                         uint16_t* dst_v,
3872                         int depth,
3873                         int width) {
3874   int shift = depth - 16;  // Negative for right shift.
3875   asm volatile(
3876       "vdup.16 q2, %4 \n"
3877       "1: \n"
3878       "vld2.16 {q0, q1}, [%0]! \n"  // load 8 UV
3879       "vshl.u16 q0, q0, q2 \n"
3880       "vshl.u16 q1, q1, q2 \n"
3881       "subs %3, %3, #8 \n"  // 8 src pixels per loop
3882       "vst1.16 {q0}, [%1]! \n"  // store 8 U pixels
3883       "vst1.16 {q1}, [%2]! \n"  // store 8 V pixels
3884       "bgt 1b \n"
3885       : "+r"(src_uv),  // %0
3886         "+r"(dst_u),   // %1
3887         "+r"(dst_v),   // %2
3888         "+r"(width)    // %3
3889       : "r"(shift)     // %4
3890       : "cc", "memory", "q0", "q1", "q2");
3891 }
3892
// Merges separate 16-bit U and V planes into interleaved UV, shifting
// each sample left by (16 - depth) so 'depth'-bit lsb-justified input
// becomes msb-justified 16-bit output.
MergeUVRow_16_NEON(const uint16_t * src_u,const uint16_t * src_v,uint16_t * dst_uv,int depth,int width)3893 void MergeUVRow_16_NEON(const uint16_t* src_u,
3894                         const uint16_t* src_v,
3895                         uint16_t* dst_uv,
3896                         int depth,
3897                         int width) {
3898   int shift = 16 - depth;
3899   asm volatile(
3900       "vdup.16 q2, %4 \n"
3901       "1: \n"
3902       "vld1.16 {q0}, [%0]! \n"  // load 8 U
3903       "vld1.16 {q1}, [%1]! \n"  // load 8 V
3904       "vshl.u16 q0, q0, q2 \n"
3905       "vshl.u16 q1, q1, q2 \n"
3906       "subs %3, %3, #8 \n"  // 8 src pixels per loop
3907       "vst2.16 {q0, q1}, [%2]! \n"  // store 8 UV pixels
3908       "bgt 1b \n"
3909       : "+r"(src_u),   // %0
3910         "+r"(src_v),   // %1
3911         "+r"(dst_uv),  // %2
3912         "+r"(width)    // %3
3913       : "r"(shift)     // %4
3914       : "cc", "memory", "q0", "q1", "q2");
3915 }
3916
// Multiplies 16 uint16 samples per iteration by 'scale' (modulo 2^16;
// vmul.u16 keeps only the low 16 bits of each product).
MultiplyRow_16_NEON(const uint16_t * src_y,uint16_t * dst_y,int scale,int width)3917 void MultiplyRow_16_NEON(const uint16_t* src_y,
3918                          uint16_t* dst_y,
3919                          int scale,
3920                          int width) {
3921   asm volatile(
3922       "vdup.16 q2, %3 \n"
3923       "1: \n"
3924       "vld1.16 {q0}, [%0]! \n"
3925       "vld1.16 {q1}, [%0]! \n"
3926       "vmul.u16 q0, q0, q2 \n"
3927       "vmul.u16 q1, q1, q2 \n"
3928       "vst1.16 {q0}, [%1]! \n"
3929       "vst1.16 {q1}, [%1]! \n"
3930       "subs %2, %2, #16 \n"  // 16 src pixels per loop
3931       "bgt 1b \n"
3932       : "+r"(src_y),  // %0
3933         "+r"(dst_y),  // %1
3934         "+r"(width)   // %2
3935       : "r"(scale)    // %3
3936       : "cc", "memory", "q0", "q1", "q2");
3937 }
3938
// Multiplies 16 uint16 samples per iteration by a 0.16 fixed-point
// factor: dst[i] = (src[i] * scale) >> 16 (vmull widens to u32, vshrn
// keeps bits 16..31).  Division is expressed by passing a reciprocal
// as 'scale'.
DivideRow_16_NEON(const uint16_t * src_y,uint16_t * dst_y,int scale,int width)3939 void DivideRow_16_NEON(const uint16_t* src_y,
3940                        uint16_t* dst_y,
3941                        int scale,
3942                        int width) {
3943   asm volatile(
3944       "vdup.16 d8, %3 \n"
3945       "1: \n"
3946       "vld1.16 {q2, q3}, [%0]! \n"
3947       "vmull.u16 q0, d4, d8 \n"
3948       "vmull.u16 q1, d5, d8 \n"
3949       "vmull.u16 q2, d6, d8 \n"
3950       "vmull.u16 q3, d7, d8 \n"
3951       "vshrn.u32 d0, q0, #16 \n"
3952       "vshrn.u32 d1, q1, #16 \n"
3953       "vshrn.u32 d2, q2, #16 \n"
3954       "vshrn.u32 d3, q3, #16 \n"
3955       "vst1.16 {q0, q1}, [%1]! \n"  // store 16 pixels
3956       "subs %2, %2, #16 \n"  // 16 src pixels per loop
3957       "bgt 1b \n"
3958       : "+r"(src_y),  // %0
3959         "+r"(dst_y),  // %1
3960         "+r"(width)   // %2
3961       : "r"(scale)    // %3
3962       : "cc", "memory", "q0", "q1", "q2", "q3", "d8");
3963 }
3964
3965 // Use scale to convert lsb formats to msb, depending how many bits there are:
3966 // 32768 = 9 bits = shr 1
3967 // 16384 = 10 bits = shr 2
3968 // 4096 = 12 bits = shr 4
3969 // 256 = 16 bits = shr 8
// Narrows msb-justified 16-bit samples to 8 bits, 16 per iteration.
// shift = 15 - clz(scale) converts the power-of-two-ish 'scale' into a
// (usually negative) shift count; vshl.u16 with a negative count
// shifts right.  vqmovn saturates the result to u8.
// Precondition: scale must be nonzero — __builtin_clz(0) is undefined.
Convert16To8Row_NEON(const uint16_t * src_y,uint8_t * dst_y,int scale,int width)3970 void Convert16To8Row_NEON(const uint16_t* src_y,
3971                           uint8_t* dst_y,
3972                           int scale,
3973                           int width) {
3974   int shift = 15 - __builtin_clz((int32_t)scale);  // Negative shl is shr
3975   asm volatile(
3976       "vdup.16 q2, %3 \n"
3977       "1: \n"
3978       "vld1.16 {q0}, [%0]! \n"
3979       "vld1.16 {q1}, [%0]! \n"
3980       "vshl.u16 q0, q0, q2 \n"  // shr = q2 is negative
3981       "vshl.u16 q1, q1, q2 \n"
3982       "vqmovn.u16 d0, q0 \n"
3983       "vqmovn.u16 d1, q1 \n"
3984       "subs %2, %2, #16 \n"  // 16 src pixels per loop
3985       "vst1.8 {q0}, [%1]! \n"
3986       "bgt 1b \n"
3987       : "+r"(src_y),  // %0
3988         "+r"(dst_y),  // %1
3989         "+r"(width)   // %2
3990       : "r"(shift)    // %3
3991       : "cc", "memory", "q0", "q1", "q2");
3992 }
3993
3994 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
3995
3996 #ifdef __cplusplus
3997 } // extern "C"
3998 } // namespace libyuv
3999 #endif
4000