1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
22 !defined(__aarch64__)
23
24 static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
25 2, 6, 10, 14, 3, 7, 11, 15};
26
// Transposes a strip of 8 source rows by |width| columns. Every source column
// becomes one 8-byte destination row.
//
//   src         first byte of the top row of the 8-row source strip.
//   src_stride  byte offset between consecutive source rows.
//   dst         first destination row; one row is written per source column.
//   dst_stride  byte offset between consecutive destination rows.
//   width       number of source columns to transpose.
//
// Columns are consumed in 8x8 blocks; what is left (1..7 columns) is handled
// by at most one 4x8, one 2x8 and one 1x8 block, in that order.
void TransposeWx8_NEON(const uint8* src,
                       int src_stride,
                       uint8* dst,
                       int dst_stride,
                       int width) {
  const uint8* src_temp;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this.
    // If fewer than 8 columns were requested, skip the 8x8 loop altogether
    // so it never reads or writes columns beyond |width|.
    "subs        %5, #8 \n"
    "blt         5f \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1: \n"
    "mov         %0, %1 \n"

    // Load 8 bytes from each of the 8 source rows.
    MEMACCESS(0)
    "vld1.8      {d0}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d1}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d2}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d3}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d4}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d5}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d6}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.8      {d7}, [%0] \n"

    // Transpose the block in registers: vtrn passes at 8-, 16- and 32-bit
    // granularity followed by a byte swap within each 16-bit lane.
    "vtrn.8      d1, d0 \n"
    "vtrn.8      d3, d2 \n"
    "vtrn.8      d5, d4 \n"
    "vtrn.8      d7, d6 \n"

    "vtrn.16     d1, d3 \n"
    "vtrn.16     d0, d2 \n"
    "vtrn.16     d5, d7 \n"
    "vtrn.16     d4, d6 \n"

    "vtrn.32     d1, d5 \n"
    "vtrn.32     d0, d4 \n"
    "vtrn.32     d3, d7 \n"
    "vtrn.32     d2, d6 \n"

    "vrev16.8    q0, q0 \n"
    "vrev16.8    q1, q1 \n"
    "vrev16.8    q2, q2 \n"
    "vrev16.8    q3, q3 \n"

    "mov         %0, %3 \n"

    // Store 8 destination rows of 8 bytes each.
    MEMACCESS(0)
    "vst1.8      {d1}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d0}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d3}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d2}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d5}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d4}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d7}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d6}, [%0] \n"

    "add         %1, #8 \n"               // src += 8
    "add         %3, %3, %4, lsl #3 \n"   // dst += 8 * dst_stride
    "subs        %5, #8 \n"               // w   -= 8
    "bge         1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "5: \n"
    "adds        %5, #8 \n"
    "beq         4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %5, #2 \n"
    "blt         3f \n"

    "cmp         %5, #4 \n"
    "blt         2f \n"

    // 4x8 block: 4 bytes from each of the 8 source rows, two rows per
    // d register.
    "mov         %0, %1 \n"
    MEMACCESS(0)
    "vld1.32     {d0[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d0[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d1[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d1[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d2[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d2[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d3[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.32     {d3[1]}, [%0] \n"

    "mov         %0, %3 \n"

    // Load the 4x4 transpose shuffle table into q3 (d6, d7).
    MEMACCESS(6)
    "vld1.8      {q3}, [%6] \n"

    // d4/d5 = transposed source rows 0-3, d0/d1 = transposed source rows 4-7.
    "vtbl.8      d4, {d0, d1}, d6 \n"
    "vtbl.8      d5, {d0, d1}, d7 \n"
    "vtbl.8      d0, {d2, d3}, d6 \n"
    "vtbl.8      d1, {d2, d3}, d7 \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    // Bytes 0-3 of the four destination rows.
    MEMACCESS(0)
    "vst1.32     {d4[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d4[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d5[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d5[1]}, [%0] \n"

    // Bytes 4-7 of the same four destination rows.
    "add         %0, %3, #4 \n"
    MEMACCESS(0)
    "vst1.32     {d0[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d0[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d1[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d1[1]}, [%0] \n"

    "add         %1, #4 \n"               // src += 4
    "add         %3, %3, %4, lsl #2 \n"   // dst += 4 * dst_stride
    "subs        %5, #4 \n"               // w   -= 4
    "beq         4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %5, #2 \n"
    "blt         3f \n"

    // 2x8 block: 2 bytes from each source row; even rows go to d0, odd rows
    // to d1, so a single vtrn.8 separates the two columns.
    "2: \n"
    "mov         %0, %1 \n"
    MEMACCESS(0)
    "vld1.16     {d0[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d1[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d0[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d1[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d0[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d1[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d0[3]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.16     {d1[3]}, [%0] \n"

    "vtrn.8      d0, d1 \n"

    "mov         %0, %3 \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.64     {d1}, [%0] \n"

    "add         %1, #2 \n"               // src += 2
    "add         %3, %3, %4, lsl #1 \n"   // dst += 2 * dst_stride
    "subs        %5, #2 \n"               // w   -= 2
    "beq         4f \n"

    // 1x8 block: gather one byte from each source row into d0 and store it
    // as a single destination row.
    "3: \n"
    MEMACCESS(1)
    "vld1.8      {d0[0]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[1]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[2]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[3]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[4]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[5]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[6]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld1.8      {d0[7]}, [%1] \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3] \n"

    "4: \n"

    : "=&r"(src_temp),          // %0
      "+r"(src),                // %1
      "+r"(src_stride),         // %2
      "+r"(dst),                // %3
      "+r"(dst_stride),         // %4
      "+r"(width)               // %5
    : "r"(&kVTbl4x4Transpose)   // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
}
244
245 static uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
246 4, 12, 5, 13, 6, 14, 7, 15};
247
// Transposes a strip of 8 source rows of byte-interleaved two-channel data
// (for example UV) and splits the channels: the first byte of every pair goes
// to |dst_a|, the second to |dst_b|. Every source column (one byte pair)
// becomes one 8-byte row in each destination.
//
//   src           first byte of the top row of the 8-row source strip.
//   src_stride    byte offset between consecutive source rows.
//   dst_a         first destination row for the first channel.
//   dst_stride_a  byte offset between consecutive |dst_a| rows.
//   dst_b         first destination row for the second channel.
//   dst_stride_b  byte offset between consecutive |dst_b| rows.
//   width         number of source columns (byte pairs) to transpose.
//
// Columns are consumed in 8x8 blocks; what is left (1..7 columns) is handled
// by at most one 4x8, one 2x8 and one 1x8 block, in that order.
void TransposeUVWx8_NEON(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  const uint8* src_temp;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this.
    // If fewer than 8 columns were requested, skip the 8x8 loop altogether
    // so it never reads or writes columns beyond |width|.
    "subs        %7, #8 \n"
    "blt         5f \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1: \n"
    "mov         %0, %1 \n"

    // Load 8 byte pairs from each of the 8 source rows; vld2.8 puts the
    // first bytes of the pairs in the even register and the second bytes in
    // the odd register.
    MEMACCESS(0)
    "vld2.8      {d0, d1}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d2, d3}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d4, d5}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d6, d7}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d16, d17}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d18, d19}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d20, d21}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.8      {d22, d23}, [%0] \n"

    // Transpose both channels at once: each q register holds one row of the
    // first channel (low half) and of the second channel (high half).
    "vtrn.8      q1, q0 \n"
    "vtrn.8      q3, q2 \n"
    "vtrn.8      q9, q8 \n"
    "vtrn.8      q11, q10 \n"

    "vtrn.16     q1, q3 \n"
    "vtrn.16     q0, q2 \n"
    "vtrn.16     q9, q11 \n"
    "vtrn.16     q8, q10 \n"

    "vtrn.32     q1, q9 \n"
    "vtrn.32     q0, q8 \n"
    "vtrn.32     q3, q11 \n"
    "vtrn.32     q2, q10 \n"

    "vrev16.8    q0, q0 \n"
    "vrev16.8    q1, q1 \n"
    "vrev16.8    q2, q2 \n"
    "vrev16.8    q3, q3 \n"
    "vrev16.8    q8, q8 \n"
    "vrev16.8    q9, q9 \n"
    "vrev16.8    q10, q10 \n"
    "vrev16.8    q11, q11 \n"

    "mov         %0, %3 \n"

    // First channel: low halves of the q registers go to dst_a.
    MEMACCESS(0)
    "vst1.8      {d2}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d0}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d6}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d4}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d18}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d16}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d22}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.8      {d20}, [%0] \n"

    "mov         %0, %5 \n"

    // Second channel: high halves of the q registers go to dst_b.
    MEMACCESS(0)
    "vst1.8      {d3}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d1}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d7}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d5}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d19}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d17}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d23}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.8      {d21}, [%0] \n"

    "add         %1, #8*2 \n"             // src   += 8*2
    "add         %3, %3, %4, lsl #3 \n"   // dst_a += 8 * dst_stride_a
    "add         %5, %5, %6, lsl #3 \n"   // dst_b += 8 * dst_stride_b
    "subs        %7, #8 \n"               // w     -= 8
    "bge         1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "5: \n"
    "adds        %7, #8 \n"
    "beq         4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %7, #2 \n"
    "blt         3f \n"

    "cmp         %7, #4 \n"
    "blt         2f \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block: 4 byte pairs (8 bytes) from each of the 8 source rows.
    "mov         %0, %1 \n"
    MEMACCESS(0)
    "vld1.64     {d0}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d1}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d2}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d3}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d4}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d5}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d6}, [%0], %2 \n"
    MEMACCESS(0)
    "vld1.64     {d7}, [%0] \n"

    // Load the interleave shuffle table into q15 (d30, d31).
    MEMACCESS(8)
    "vld1.8      {q15}, [%8] \n"

    "vtrn.8      q0, q1 \n"
    "vtrn.8      q2, q3 \n"

    "vtbl.8      d16, {d0, d1}, d30 \n"
    "vtbl.8      d17, {d0, d1}, d31 \n"
    "vtbl.8      d18, {d2, d3}, d30 \n"
    "vtbl.8      d19, {d2, d3}, d31 \n"
    "vtbl.8      d20, {d4, d5}, d30 \n"
    "vtbl.8      d21, {d4, d5}, d31 \n"
    "vtbl.8      d22, {d6, d7}, d30 \n"
    "vtbl.8      d23, {d6, d7}, d31 \n"

    "mov         %0, %3 \n"

    // dst_a: bytes 0-3 of the four destination rows.
    MEMACCESS(0)
    "vst1.32     {d16[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d16[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d17[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d17[1]}, [%0], %4 \n"

    // dst_a: bytes 4-7 of the same four destination rows.
    "add         %0, %3, #4 \n"
    MEMACCESS(0)
    "vst1.32     {d20[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d20[1]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d21[0]}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.32     {d21[1]}, [%0] \n"

    "mov         %0, %5 \n"

    // dst_b: bytes 0-3 of the four destination rows.
    MEMACCESS(0)
    "vst1.32     {d18[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32     {d18[1]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32     {d19[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32     {d19[1]}, [%0], %6 \n"

    // dst_b: bytes 4-7 of the same four destination rows.
    "add         %0, %5, #4 \n"
    MEMACCESS(0)
    "vst1.32     {d22[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32     {d22[1]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32     {d23[0]}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.32     {d23[1]}, [%0] \n"

    "add         %1, #4*2 \n"             // src   += 4 * 2
    "add         %3, %3, %4, lsl #2 \n"   // dst_a += 4 * dst_stride_a
    "add         %5, %5, %6, lsl #2 \n"   // dst_b += 4 * dst_stride_b
    "subs        %7, #4 \n"               // w     -= 4
    "beq         4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %7, #2 \n"
    "blt         3f \n"

    // 2x8 block: 2 byte pairs from each source row. vld2.16 sends the first
    // pair to d0/d1 (even/odd rows) and the second pair to d2/d3, so one
    // vtrn.8 per register pair separates the two channels.
    "2: \n"
    "mov         %0, %1 \n"
    MEMACCESS(0)
    "vld2.16     {d0[0], d2[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d1[0], d3[0]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d0[1], d2[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d1[1], d3[1]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d0[2], d2[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d1[2], d3[2]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d0[3], d2[3]}, [%0], %2 \n"
    MEMACCESS(0)
    "vld2.16     {d1[3], d3[3]}, [%0] \n"

    "vtrn.8      d0, d1 \n"
    "vtrn.8      d2, d3 \n"

    "mov         %0, %3 \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4 \n"
    MEMACCESS(0)
    "vst1.64     {d2}, [%0] \n"

    "mov         %0, %5 \n"

    MEMACCESS(0)
    "vst1.64     {d1}, [%0], %6 \n"
    MEMACCESS(0)
    "vst1.64     {d3}, [%0] \n"

    "add         %1, #2*2 \n"             // src   += 2 * 2
    "add         %3, %3, %4, lsl #1 \n"   // dst_a += 2 * dst_stride_a
    "add         %5, %5, %6, lsl #1 \n"   // dst_b += 2 * dst_stride_b
    "subs        %7, #2 \n"               // w     -= 2
    "beq         4f \n"

    // 1x8 block: gather one byte pair from each source row; d0 collects the
    // first channel and d1 the second.
    "3: \n"
    MEMACCESS(1)
    "vld2.8      {d0[0], d1[0]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[1], d1[1]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[2], d1[2]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[3], d1[3]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[4], d1[4]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[5], d1[5]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[6], d1[6]}, [%1], %2 \n"
    MEMACCESS(1)
    "vld2.8      {d0[7], d1[7]}, [%1] \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3] \n"
    MEMACCESS(5)
    "vst1.64     {d1}, [%5] \n"

    "4: \n"

    : "=&r"(src_temp),            // %0
      "+r"(src),                  // %1
      "+r"(src_stride),           // %2
      "+r"(dst_a),                // %3
      "+r"(dst_stride_a),         // %4
      "+r"(dst_b),                // %5
      "+r"(dst_stride_b),         // %6
      "+r"(width)                 // %7
    : "r"(&kVTbl4x4TransposeDi)   // %8
    // q15 holds the shuffle table in the 4x8 path, so it must be listed as
    // clobbered together with the data registers.
    : "memory", "cc",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q15"
  );
}
533 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
534
535 #ifdef __cplusplus
536 } // extern "C"
537 } // namespace libyuv
538 #endif
539