/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

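// Byte-shuffle indices for the 4x8 residual path in TransposeWx8_NEON below:
// each vtbl lookup gathers byte x of each 4-byte source row, so the two
// lookups per q register emit the rows of the transposed block.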
static uvec8 kVTbl4x4Transpose =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

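// Transposes a width x 8 tile: the 8 bytes of source column x become row x of
// the destination. A scalar sketch of the same operation (illustrative only;
// the NEON code below works in 8x8 blocks, then 4-, 2- and 1-column
// residuals):
//   for (int x = 0; x < width; ++x) {
//     for (int y = 0; y < 8; ++y) {
//       dst[x * dst_stride + y] = src[y * src_stride + x];
//     }
//   }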
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  const uint8* src_temp;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %5, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
    "mov         %0, %1                        \n"

    MEMACCESS(0)
    "vld1.8      {d0}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d1}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d2}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d3}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d4}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d5}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d6}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.8      {d7}, [%0]                    \n"

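    // 8x8 in-register transpose: three vtrn passes at 8-, 16- and 32-bit
    // granularity plus a final vrev16.8 pass; the deliberately swapped
    // operand pairs (d1, d0 etc.) are what the vrev16 step compensates for.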
    "vtrn.8      d1, d0                        \n"
    "vtrn.8      d3, d2                        \n"
    "vtrn.8      d5, d4                        \n"
    "vtrn.8      d7, d6                        \n"

    "vtrn.16     d1, d3                        \n"
    "vtrn.16     d0, d2                        \n"
    "vtrn.16     d5, d7                        \n"
    "vtrn.16     d4, d6                        \n"

    "vtrn.32     d1, d5                        \n"
    "vtrn.32     d0, d4                        \n"
    "vtrn.32     d3, d7                        \n"
    "vtrn.32     d2, d6                        \n"

    "vrev16.8    q0, q0                        \n"
    "vrev16.8    q1, q1                        \n"
    "vrev16.8    q2, q2                        \n"
    "vrev16.8    q3, q3                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.8      {d1}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d3}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d2}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d5}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d4}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d7}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.8      {d6}, [%0]                    \n"

    "add         %1, #8                        \n"  // src += 8
    "add         %3, %3, %4, lsl #3            \n"  // dst += 8 * dst_stride
    "subs        %5, #8                        \n"  // w   -= 8
    "bge         1b                            \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %5, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    "cmp         %5, #4                        \n"
    "blt         2f                            \n"

    // 4x8 block
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.32     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d2[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.32     {d3[1]}, [%0]                 \n"

    "mov         %0, %3                        \n"

    MEMACCESS(6)
    "vld1.8      {q3}, [%6]                    \n"

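    // Gather with kVTbl4x4Transpose (its two halves are in d6/d7): each vtbl
    // emits two transposed 4-byte rows.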
    "vtbl.8      d4, {d0, d1}, d6              \n"
    "vtbl.8      d5, {d0, d1}, d7              \n"
    "vtbl.8      d0, {d2, d3}, d6              \n"
    "vtbl.8      d1, {d2, d3}, d7              \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
    "vst1.32     {d4[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d4[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d5[1]}, [%0]                 \n"

    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d0[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d0[1]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[0]}, [%0], %4             \n"
    MEMACCESS(0)
    "vst1.32     {d1[1]}, [%0]                 \n"

    "add         %1, #4                        \n"  // src += 4
    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
    "subs        %5, #4                        \n"  // w   -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %5, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.16     {d0[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[0]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[1]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[2]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d0[3]}, [%0], %2             \n"
    MEMACCESS(0)
    "vld1.16     {d1[3]}, [%0]                 \n"

    "vtrn.8      d0, d1                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.64     {d1}, [%0]                    \n"

    "add         %1, #2                        \n"  // src += 2
    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
    "subs        %5, #2                        \n"  // w   -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    MEMACCESS(1)
    "vld1.8      {d0[0]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[1]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[2]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[3]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[4]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[5]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[6]}, [%1], %2             \n"
    MEMACCESS(1)
    "vld1.8      {d0[7]}, [%1]                 \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3]                    \n"

    "4:                                        \n"

    : "=&r"(src_temp),          // %0
      "+r"(src),                // %1
      "+r"(src_stride),         // %2
      "+r"(dst),                // %3
      "+r"(dst_stride),         // %4
      "+r"(width)               // %5
    : "r"(&kVTbl4x4Transpose)   // %6
    : "memory", "cc", "q0", "q1", "q2", "q3"
  );
}

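// Byte-shuffle indices for the 4x8 residual path in TransposeUVWx8_NEON
// below: the zip-style pattern {0, 8, 1, 9, ...} interleaves the two halves
// of a q register after the vtrn.8 pass.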
static uvec8 kVTbl4x4TransposeDi =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };

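// Transposes a width x 8 tile of interleaved UV pixels, de-interleaving as it
// goes: the U (even) bytes of source column x become row x of dst_a, and the
// V (odd) bytes become row x of dst_b. A scalar sketch of the same operation
// (illustrative only):
//   for (int x = 0; x < width; ++x) {
//     for (int y = 0; y < 8; ++y) {
//       dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];
//       dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];
//     }
//   }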
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  const uint8* src_temp;
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %7, #8                        \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                        \n"
    "mov         %0, %1                        \n"

    MEMACCESS(0)
    "vld2.8      {d0,  d1},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d2,  d3},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d4,  d5},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d6,  d7},  [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d16, d17}, [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d18, d19}, [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d20, d21}, [%0], %2          \n"
    MEMACCESS(0)
    "vld2.8      {d22, d23}, [%0]              \n"

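    // Same vtrn/vrev16 transpose network as in TransposeWx8_NEON, widened to
    // q registers: after vld2 each q holds a row's 8 U bytes in its low d
    // register and its 8 V bytes in the high one.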
    "vtrn.8      q1, q0                        \n"
    "vtrn.8      q3, q2                        \n"
    "vtrn.8      q9, q8                        \n"
    "vtrn.8      q11, q10                      \n"

    "vtrn.16     q1, q3                        \n"
    "vtrn.16     q0, q2                        \n"
    "vtrn.16     q9, q11                       \n"
    "vtrn.16     q8, q10                       \n"

    "vtrn.32     q1, q9                        \n"
    "vtrn.32     q0, q8                        \n"
    "vtrn.32     q3, q11                       \n"
    "vtrn.32     q2, q10                       \n"

    "vrev16.8    q0, q0                        \n"
    "vrev16.8    q1, q1                        \n"
    "vrev16.8    q2, q2                        \n"
    "vrev16.8    q3, q3                        \n"
    "vrev16.8    q8, q8                        \n"
    "vrev16.8    q9, q9                        \n"
    "vrev16.8    q10, q10                      \n"
    "vrev16.8    q11, q11                      \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.8      {d2},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d0},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d6},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d4},  [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d18}, [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d16}, [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d22}, [%0], %4               \n"
    MEMACCESS(0)
    "vst1.8      {d20}, [%0]                   \n"

    "mov         %0, %5                        \n"

    MEMACCESS(0)
    "vst1.8      {d3},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d1},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d7},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d5},  [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d19}, [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d17}, [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d23}, [%0], %6               \n"
    MEMACCESS(0)
    "vst1.8      {d21}, [%0]                   \n"

    "add         %1, #8*2                      \n"  // src += 8*2
    "add         %3, %3, %4, lsl #3            \n"  // dst_a += 8 * dst_stride_a
    "add         %5, %5, %6, lsl #3            \n"  // dst_b += 8 * dst_stride_b
    "subs        %7, #8                        \n"  // w     -= 8
    "bge         1b                            \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %7, #8                        \n"
    "beq         4f                            \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %7, #2                        \n"
    "blt         3f                            \n"

    "cmp         %7, #4                        \n"
    "blt         2f                            \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld1.64     {d0}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d1}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d2}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d3}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d4}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d5}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d6}, [%0], %2                \n"
    MEMACCESS(0)
    "vld1.64     {d7}, [%0]                    \n"

    MEMACCESS(8)
    "vld1.8      {q15}, [%8]                   \n"

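    // Pair the rows with vtrn.8, then use the kVTbl4x4TransposeDi shuffle
    // (loaded into q15) to transpose and de-interleave the 4x8 block of
    // UV pairs.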
    "vtrn.8      q0, q1                        \n"
    "vtrn.8      q2, q3                        \n"

    "vtbl.8      d16, {d0, d1}, d30            \n"
    "vtbl.8      d17, {d0, d1}, d31            \n"
    "vtbl.8      d18, {d2, d3}, d30            \n"
    "vtbl.8      d19, {d2, d3}, d31            \n"
    "vtbl.8      d20, {d4, d5}, d30            \n"
    "vtbl.8      d21, {d4, d5}, d31            \n"
    "vtbl.8      d22, {d6, d7}, d30            \n"
    "vtbl.8      d23, {d6, d7}, d31            \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.32     {d16[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d16[1]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d17[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d17[1]}, [%0], %4            \n"

    "add         %0, %3, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d20[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d20[1]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d21[0]}, [%0], %4            \n"
    MEMACCESS(0)
    "vst1.32     {d21[1]}, [%0]                \n"

    "mov         %0, %5                        \n"

    MEMACCESS(0)
    "vst1.32     {d18[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d18[1]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d19[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d19[1]}, [%0], %6            \n"

    "add         %0, %5, #4                    \n"
    MEMACCESS(0)
    "vst1.32     {d22[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d22[1]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d23[0]}, [%0], %6            \n"
    MEMACCESS(0)
    "vst1.32     {d23[1]}, [%0]                \n"

    "add         %1, #4*2                      \n"  // src += 4 * 2
    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
    "subs        %7, #4                        \n"  // w     -= 4
    "beq         4f                            \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %7, #2                        \n"
    "blt         3f                            \n"

    // 2x8 block
    "2:                                        \n"
    "mov         %0, %1                        \n"
    MEMACCESS(0)
    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
    MEMACCESS(0)
    "vld2.16     {d1[3], d3[3]}, [%0]          \n"

    "vtrn.8      d0, d1                        \n"
    "vtrn.8      d2, d3                        \n"

    "mov         %0, %3                        \n"

    MEMACCESS(0)
    "vst1.64     {d0}, [%0], %4                \n"
    MEMACCESS(0)
    "vst1.64     {d2}, [%0]                    \n"

    "mov         %0, %5                        \n"

    MEMACCESS(0)
    "vst1.64     {d1}, [%0], %6                \n"
    MEMACCESS(0)
    "vst1.64     {d3}, [%0]                    \n"

    "add         %1, #2*2                      \n"  // src += 2 * 2
    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
    "subs        %7, #2                        \n"  // w     -= 2
    "beq         4f                            \n"

    // 1x8 block
    "3:                                        \n"
    MEMACCESS(1)
    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
    MEMACCESS(1)
    "vld2.8      {d0[7], d1[7]}, [%1]          \n"

    MEMACCESS(3)
    "vst1.64     {d0}, [%3]                    \n"
    MEMACCESS(5)
    "vst1.64     {d1}, [%5]                    \n"

    "4:                                        \n"

    : "=&r"(src_temp),            // %0
      "+r"(src),                  // %1
      "+r"(src_stride),           // %2
      "+r"(dst_a),                // %3
      "+r"(dst_stride_a),         // %4
      "+r"(dst_b),                // %5
      "+r"(dst_stride_b),         // %6
      "+r"(width)                 // %7
    : "r"(&kVTbl4x4TransposeDi)   // %8
    : "memory", "cc",
      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
      "q15"  // q15 is overwritten by the shuffle-table load above
  );
}
#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif