/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
22 !defined(__aarch64__)
23
24 static uvec8 kVTbl4x4Transpose =
25 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
26
TransposeWx8_NEON(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)27 void TransposeWx8_NEON(const uint8* src, int src_stride,
28 uint8* dst, int dst_stride,
29 int width) {
30 const uint8* src_temp = NULL;
31 asm volatile (
32 // loops are on blocks of 8. loop will stop when
33 // counter gets to or below 0. starting the counter
34 // at w-8 allow for this
35 "sub %5, #8 \n"
36
37 // handle 8x8 blocks. this should be the majority of the plane
38 ".p2align 2 \n"
39 "1: \n"
40 "mov %0, %1 \n"
41
42 MEMACCESS(0)
43 "vld1.8 {d0}, [%0], %2 \n"
44 MEMACCESS(0)
45 "vld1.8 {d1}, [%0], %2 \n"
46 MEMACCESS(0)
47 "vld1.8 {d2}, [%0], %2 \n"
48 MEMACCESS(0)
49 "vld1.8 {d3}, [%0], %2 \n"
50 MEMACCESS(0)
51 "vld1.8 {d4}, [%0], %2 \n"
52 MEMACCESS(0)
53 "vld1.8 {d5}, [%0], %2 \n"
54 MEMACCESS(0)
55 "vld1.8 {d6}, [%0], %2 \n"
56 MEMACCESS(0)
57 "vld1.8 {d7}, [%0] \n"
58
59 "vtrn.8 d1, d0 \n"
60 "vtrn.8 d3, d2 \n"
61 "vtrn.8 d5, d4 \n"
62 "vtrn.8 d7, d6 \n"
63
64 "vtrn.16 d1, d3 \n"
65 "vtrn.16 d0, d2 \n"
66 "vtrn.16 d5, d7 \n"
67 "vtrn.16 d4, d6 \n"
68
69 "vtrn.32 d1, d5 \n"
70 "vtrn.32 d0, d4 \n"
71 "vtrn.32 d3, d7 \n"
72 "vtrn.32 d2, d6 \n"
73
74 "vrev16.8 q0, q0 \n"
75 "vrev16.8 q1, q1 \n"
76 "vrev16.8 q2, q2 \n"
77 "vrev16.8 q3, q3 \n"
78
79 "mov %0, %3 \n"
80
81 MEMACCESS(0)
82 "vst1.8 {d1}, [%0], %4 \n"
83 MEMACCESS(0)
84 "vst1.8 {d0}, [%0], %4 \n"
85 MEMACCESS(0)
86 "vst1.8 {d3}, [%0], %4 \n"
87 MEMACCESS(0)
88 "vst1.8 {d2}, [%0], %4 \n"
89 MEMACCESS(0)
90 "vst1.8 {d5}, [%0], %4 \n"
91 MEMACCESS(0)
92 "vst1.8 {d4}, [%0], %4 \n"
93 MEMACCESS(0)
94 "vst1.8 {d7}, [%0], %4 \n"
95 MEMACCESS(0)
96 "vst1.8 {d6}, [%0] \n"
97
98 "add %1, #8 \n" // src += 8
99 "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
100 "subs %5, #8 \n" // w -= 8
101 "bge 1b \n"
102
103 // add 8 back to counter. if the result is 0 there are
104 // no residuals.
105 "adds %5, #8 \n"
106 "beq 4f \n"
107
108 // some residual, so between 1 and 7 lines left to transpose
109 "cmp %5, #2 \n"
110 "blt 3f \n"
111
112 "cmp %5, #4 \n"
113 "blt 2f \n"
114
115 // 4x8 block
116 "mov %0, %1 \n"
117 MEMACCESS(0)
118 "vld1.32 {d0[0]}, [%0], %2 \n"
119 MEMACCESS(0)
120 "vld1.32 {d0[1]}, [%0], %2 \n"
121 MEMACCESS(0)
122 "vld1.32 {d1[0]}, [%0], %2 \n"
123 MEMACCESS(0)
124 "vld1.32 {d1[1]}, [%0], %2 \n"
125 MEMACCESS(0)
126 "vld1.32 {d2[0]}, [%0], %2 \n"
127 MEMACCESS(0)
128 "vld1.32 {d2[1]}, [%0], %2 \n"
129 MEMACCESS(0)
130 "vld1.32 {d3[0]}, [%0], %2 \n"
131 MEMACCESS(0)
132 "vld1.32 {d3[1]}, [%0] \n"
133
134 "mov %0, %3 \n"
135
136 MEMACCESS(6)
137 "vld1.8 {q3}, [%6] \n"
138
139 "vtbl.8 d4, {d0, d1}, d6 \n"
140 "vtbl.8 d5, {d0, d1}, d7 \n"
141 "vtbl.8 d0, {d2, d3}, d6 \n"
142 "vtbl.8 d1, {d2, d3}, d7 \n"
143
144 // TODO(frkoenig): Rework shuffle above to
145 // write out with 4 instead of 8 writes.
146 MEMACCESS(0)
147 "vst1.32 {d4[0]}, [%0], %4 \n"
148 MEMACCESS(0)
149 "vst1.32 {d4[1]}, [%0], %4 \n"
150 MEMACCESS(0)
151 "vst1.32 {d5[0]}, [%0], %4 \n"
152 MEMACCESS(0)
153 "vst1.32 {d5[1]}, [%0] \n"
154
155 "add %0, %3, #4 \n"
156 MEMACCESS(0)
157 "vst1.32 {d0[0]}, [%0], %4 \n"
158 MEMACCESS(0)
159 "vst1.32 {d0[1]}, [%0], %4 \n"
160 MEMACCESS(0)
161 "vst1.32 {d1[0]}, [%0], %4 \n"
162 MEMACCESS(0)
163 "vst1.32 {d1[1]}, [%0] \n"
164
165 "add %1, #4 \n" // src += 4
166 "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
167 "subs %5, #4 \n" // w -= 4
168 "beq 4f \n"
169
170 // some residual, check to see if it includes a 2x8 block,
171 // or less
172 "cmp %5, #2 \n"
173 "blt 3f \n"
174
175 // 2x8 block
176 "2: \n"
177 "mov %0, %1 \n"
178 MEMACCESS(0)
179 "vld1.16 {d0[0]}, [%0], %2 \n"
180 MEMACCESS(0)
181 "vld1.16 {d1[0]}, [%0], %2 \n"
182 MEMACCESS(0)
183 "vld1.16 {d0[1]}, [%0], %2 \n"
184 MEMACCESS(0)
185 "vld1.16 {d1[1]}, [%0], %2 \n"
186 MEMACCESS(0)
187 "vld1.16 {d0[2]}, [%0], %2 \n"
188 MEMACCESS(0)
189 "vld1.16 {d1[2]}, [%0], %2 \n"
190 MEMACCESS(0)
191 "vld1.16 {d0[3]}, [%0], %2 \n"
192 MEMACCESS(0)
193 "vld1.16 {d1[3]}, [%0] \n"
194
195 "vtrn.8 d0, d1 \n"
196
197 "mov %0, %3 \n"
198
199 MEMACCESS(0)
200 "vst1.64 {d0}, [%0], %4 \n"
201 MEMACCESS(0)
202 "vst1.64 {d1}, [%0] \n"
203
204 "add %1, #2 \n" // src += 2
205 "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
206 "subs %5, #2 \n" // w -= 2
207 "beq 4f \n"
208
209 // 1x8 block
210 "3: \n"
211 MEMACCESS(1)
212 "vld1.8 {d0[0]}, [%1], %2 \n"
213 MEMACCESS(1)
214 "vld1.8 {d0[1]}, [%1], %2 \n"
215 MEMACCESS(1)
216 "vld1.8 {d0[2]}, [%1], %2 \n"
217 MEMACCESS(1)
218 "vld1.8 {d0[3]}, [%1], %2 \n"
219 MEMACCESS(1)
220 "vld1.8 {d0[4]}, [%1], %2 \n"
221 MEMACCESS(1)
222 "vld1.8 {d0[5]}, [%1], %2 \n"
223 MEMACCESS(1)
224 "vld1.8 {d0[6]}, [%1], %2 \n"
225 MEMACCESS(1)
226 "vld1.8 {d0[7]}, [%1] \n"
227
228 MEMACCESS(3)
229 "vst1.64 {d0}, [%3] \n"
230
231 "4: \n"
232
233 : "+r"(src_temp), // %0
234 "+r"(src), // %1
235 "+r"(src_stride), // %2
236 "+r"(dst), // %3
237 "+r"(dst_stride), // %4
238 "+r"(width) // %5
239 : "r"(&kVTbl4x4Transpose) // %6
240 : "memory", "cc", "q0", "q1", "q2", "q3"
241 );
242 }
243
244 static uvec8 kVTbl4x4TransposeDi =
245 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
246
TransposeUVWx8_NEON(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int width)247 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
248 uint8* dst_a, int dst_stride_a,
249 uint8* dst_b, int dst_stride_b,
250 int width) {
251 const uint8* src_temp = NULL;
252 asm volatile (
253 // loops are on blocks of 8. loop will stop when
254 // counter gets to or below 0. starting the counter
255 // at w-8 allow for this
256 "sub %7, #8 \n"
257
258 // handle 8x8 blocks. this should be the majority of the plane
259 ".p2align 2 \n"
260 "1: \n"
261 "mov %0, %1 \n"
262
263 MEMACCESS(0)
264 "vld2.8 {d0, d1}, [%0], %2 \n"
265 MEMACCESS(0)
266 "vld2.8 {d2, d3}, [%0], %2 \n"
267 MEMACCESS(0)
268 "vld2.8 {d4, d5}, [%0], %2 \n"
269 MEMACCESS(0)
270 "vld2.8 {d6, d7}, [%0], %2 \n"
271 MEMACCESS(0)
272 "vld2.8 {d16, d17}, [%0], %2 \n"
273 MEMACCESS(0)
274 "vld2.8 {d18, d19}, [%0], %2 \n"
275 MEMACCESS(0)
276 "vld2.8 {d20, d21}, [%0], %2 \n"
277 MEMACCESS(0)
278 "vld2.8 {d22, d23}, [%0] \n"
279
280 "vtrn.8 q1, q0 \n"
281 "vtrn.8 q3, q2 \n"
282 "vtrn.8 q9, q8 \n"
283 "vtrn.8 q11, q10 \n"
284
285 "vtrn.16 q1, q3 \n"
286 "vtrn.16 q0, q2 \n"
287 "vtrn.16 q9, q11 \n"
288 "vtrn.16 q8, q10 \n"
289
290 "vtrn.32 q1, q9 \n"
291 "vtrn.32 q0, q8 \n"
292 "vtrn.32 q3, q11 \n"
293 "vtrn.32 q2, q10 \n"
294
295 "vrev16.8 q0, q0 \n"
296 "vrev16.8 q1, q1 \n"
297 "vrev16.8 q2, q2 \n"
298 "vrev16.8 q3, q3 \n"
299 "vrev16.8 q8, q8 \n"
300 "vrev16.8 q9, q9 \n"
301 "vrev16.8 q10, q10 \n"
302 "vrev16.8 q11, q11 \n"
303
304 "mov %0, %3 \n"
305
306 MEMACCESS(0)
307 "vst1.8 {d2}, [%0], %4 \n"
308 MEMACCESS(0)
309 "vst1.8 {d0}, [%0], %4 \n"
310 MEMACCESS(0)
311 "vst1.8 {d6}, [%0], %4 \n"
312 MEMACCESS(0)
313 "vst1.8 {d4}, [%0], %4 \n"
314 MEMACCESS(0)
315 "vst1.8 {d18}, [%0], %4 \n"
316 MEMACCESS(0)
317 "vst1.8 {d16}, [%0], %4 \n"
318 MEMACCESS(0)
319 "vst1.8 {d22}, [%0], %4 \n"
320 MEMACCESS(0)
321 "vst1.8 {d20}, [%0] \n"
322
323 "mov %0, %5 \n"
324
325 MEMACCESS(0)
326 "vst1.8 {d3}, [%0], %6 \n"
327 MEMACCESS(0)
328 "vst1.8 {d1}, [%0], %6 \n"
329 MEMACCESS(0)
330 "vst1.8 {d7}, [%0], %6 \n"
331 MEMACCESS(0)
332 "vst1.8 {d5}, [%0], %6 \n"
333 MEMACCESS(0)
334 "vst1.8 {d19}, [%0], %6 \n"
335 MEMACCESS(0)
336 "vst1.8 {d17}, [%0], %6 \n"
337 MEMACCESS(0)
338 "vst1.8 {d23}, [%0], %6 \n"
339 MEMACCESS(0)
340 "vst1.8 {d21}, [%0] \n"
341
342 "add %1, #8*2 \n" // src += 8*2
343 "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
344 "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
345 "subs %7, #8 \n" // w -= 8
346 "bge 1b \n"
347
348 // add 8 back to counter. if the result is 0 there are
349 // no residuals.
350 "adds %7, #8 \n"
351 "beq 4f \n"
352
353 // some residual, so between 1 and 7 lines left to transpose
354 "cmp %7, #2 \n"
355 "blt 3f \n"
356
357 "cmp %7, #4 \n"
358 "blt 2f \n"
359
360 // TODO(frkoenig): Clean this up
361 // 4x8 block
362 "mov %0, %1 \n"
363 MEMACCESS(0)
364 "vld1.64 {d0}, [%0], %2 \n"
365 MEMACCESS(0)
366 "vld1.64 {d1}, [%0], %2 \n"
367 MEMACCESS(0)
368 "vld1.64 {d2}, [%0], %2 \n"
369 MEMACCESS(0)
370 "vld1.64 {d3}, [%0], %2 \n"
371 MEMACCESS(0)
372 "vld1.64 {d4}, [%0], %2 \n"
373 MEMACCESS(0)
374 "vld1.64 {d5}, [%0], %2 \n"
375 MEMACCESS(0)
376 "vld1.64 {d6}, [%0], %2 \n"
377 MEMACCESS(0)
378 "vld1.64 {d7}, [%0] \n"
379
380 MEMACCESS(8)
381 "vld1.8 {q15}, [%8] \n"
382
383 "vtrn.8 q0, q1 \n"
384 "vtrn.8 q2, q3 \n"
385
386 "vtbl.8 d16, {d0, d1}, d30 \n"
387 "vtbl.8 d17, {d0, d1}, d31 \n"
388 "vtbl.8 d18, {d2, d3}, d30 \n"
389 "vtbl.8 d19, {d2, d3}, d31 \n"
390 "vtbl.8 d20, {d4, d5}, d30 \n"
391 "vtbl.8 d21, {d4, d5}, d31 \n"
392 "vtbl.8 d22, {d6, d7}, d30 \n"
393 "vtbl.8 d23, {d6, d7}, d31 \n"
394
395 "mov %0, %3 \n"
396
397 MEMACCESS(0)
398 "vst1.32 {d16[0]}, [%0], %4 \n"
399 MEMACCESS(0)
400 "vst1.32 {d16[1]}, [%0], %4 \n"
401 MEMACCESS(0)
402 "vst1.32 {d17[0]}, [%0], %4 \n"
403 MEMACCESS(0)
404 "vst1.32 {d17[1]}, [%0], %4 \n"
405
406 "add %0, %3, #4 \n"
407 MEMACCESS(0)
408 "vst1.32 {d20[0]}, [%0], %4 \n"
409 MEMACCESS(0)
410 "vst1.32 {d20[1]}, [%0], %4 \n"
411 MEMACCESS(0)
412 "vst1.32 {d21[0]}, [%0], %4 \n"
413 MEMACCESS(0)
414 "vst1.32 {d21[1]}, [%0] \n"
415
416 "mov %0, %5 \n"
417
418 MEMACCESS(0)
419 "vst1.32 {d18[0]}, [%0], %6 \n"
420 MEMACCESS(0)
421 "vst1.32 {d18[1]}, [%0], %6 \n"
422 MEMACCESS(0)
423 "vst1.32 {d19[0]}, [%0], %6 \n"
424 MEMACCESS(0)
425 "vst1.32 {d19[1]}, [%0], %6 \n"
426
427 "add %0, %5, #4 \n"
428 MEMACCESS(0)
429 "vst1.32 {d22[0]}, [%0], %6 \n"
430 MEMACCESS(0)
431 "vst1.32 {d22[1]}, [%0], %6 \n"
432 MEMACCESS(0)
433 "vst1.32 {d23[0]}, [%0], %6 \n"
434 MEMACCESS(0)
435 "vst1.32 {d23[1]}, [%0] \n"
436
437 "add %1, #4*2 \n" // src += 4 * 2
438 "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
439 "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
440 "subs %7, #4 \n" // w -= 4
441 "beq 4f \n"
442
443 // some residual, check to see if it includes a 2x8 block,
444 // or less
445 "cmp %7, #2 \n"
446 "blt 3f \n"
447
448 // 2x8 block
449 "2: \n"
450 "mov %0, %1 \n"
451 MEMACCESS(0)
452 "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
453 MEMACCESS(0)
454 "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
455 MEMACCESS(0)
456 "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
457 MEMACCESS(0)
458 "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
459 MEMACCESS(0)
460 "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
461 MEMACCESS(0)
462 "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
463 MEMACCESS(0)
464 "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
465 MEMACCESS(0)
466 "vld2.16 {d1[3], d3[3]}, [%0] \n"
467
468 "vtrn.8 d0, d1 \n"
469 "vtrn.8 d2, d3 \n"
470
471 "mov %0, %3 \n"
472
473 MEMACCESS(0)
474 "vst1.64 {d0}, [%0], %4 \n"
475 MEMACCESS(0)
476 "vst1.64 {d2}, [%0] \n"
477
478 "mov %0, %5 \n"
479
480 MEMACCESS(0)
481 "vst1.64 {d1}, [%0], %6 \n"
482 MEMACCESS(0)
483 "vst1.64 {d3}, [%0] \n"
484
485 "add %1, #2*2 \n" // src += 2 * 2
486 "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
487 "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
488 "subs %7, #2 \n" // w -= 2
489 "beq 4f \n"
490
491 // 1x8 block
492 "3: \n"
493 MEMACCESS(1)
494 "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
495 MEMACCESS(1)
496 "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
497 MEMACCESS(1)
498 "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
499 MEMACCESS(1)
500 "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
501 MEMACCESS(1)
502 "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
503 MEMACCESS(1)
504 "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
505 MEMACCESS(1)
506 "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
507 MEMACCESS(1)
508 "vld2.8 {d0[7], d1[7]}, [%1] \n"
509
510 MEMACCESS(3)
511 "vst1.64 {d0}, [%3] \n"
512 MEMACCESS(5)
513 "vst1.64 {d1}, [%5] \n"
514
515 "4: \n"
516
517 : "+r"(src_temp), // %0
518 "+r"(src), // %1
519 "+r"(src_stride), // %2
520 "+r"(dst_a), // %3
521 "+r"(dst_stride_a), // %4
522 "+r"(dst_b), // %5
523 "+r"(dst_stride_b), // %6
524 "+r"(width) // %7
525 : "r"(&kVTbl4x4TransposeDi) // %8
526 : "memory", "cc",
527 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
528 );
529 }
530 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
531
532 #ifdef __cplusplus
533 } // extern "C"
534 } // namespace libyuv
535 #endif
536