1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 // This module is for GCC Neon armv8 64 bit.
22 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
23
// Byte-shuffle table for the 4x8 residual path of TransposeWx8_NEON:
// fed to `tbl` (see the 4x8 block below) so that byte i of each of the
// four 4-byte rows packed in one q register comes out contiguous,
// i.e. it transposes a 4x4 byte tile in a single lookup.
static uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
                                  2, 6, 10, 14, 3, 7, 11, 15};
26
// Transposes an 8-row tile of 8-bit pixels: reads 8 rows of `width`
// bytes from `src` (rows `src_stride` apart) and writes `width` rows
// of 8 bytes to `dst` (rows `dst_stride` apart).  The bulk of the
// plane is processed in 8x8 blocks with three trn1/trn2 butterfly
// stages (bytes, then halfwords, then words); residual widths of
// 4, 2 and 1 columns are handled by dedicated tail paths.
//
// asm operand map:
//   %0 src_temp - scratch pointer that walks down the rows of a block
//   %1 src      - advanced rightwards as columns are consumed
//   %2 dst      - advanced downwards as output rows are produced
//   %3 width64  - remaining width, used as the loop counter
//   %4          - address of kVTbl4x4Transpose (4x8 residual shuffle)
//   %5, %6      - src_stride / dst_stride as ptrdiff_t
void TransposeWx8_NEON(const uint8* src,
                       int src_stride,
                       uint8* dst,
                       int dst_stride,
                       int width) {
  const uint8* src_temp;
  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %3, %3, #8                      \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                          \n"
    "mov         %0, %1                          \n"

    // Load 8 rows of 8 bytes into v0..v7.
    MEMACCESS(0)
    "ld1         {v0.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v1.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v2.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v3.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v4.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v5.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v6.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v7.8b}, [%0]                   \n"

    // Stage 1: interleave bytes of adjacent row pairs.
    "trn2        v16.8b, v0.8b, v1.8b            \n"
    "trn1        v17.8b, v0.8b, v1.8b            \n"
    "trn2        v18.8b, v2.8b, v3.8b            \n"
    "trn1        v19.8b, v2.8b, v3.8b            \n"
    "trn2        v20.8b, v4.8b, v5.8b            \n"
    "trn1        v21.8b, v4.8b, v5.8b            \n"
    "trn2        v22.8b, v6.8b, v7.8b            \n"
    "trn1        v23.8b, v6.8b, v7.8b            \n"

    // Stage 2: interleave 16-bit lanes across pairs of pairs.
    "trn2        v3.4h, v17.4h, v19.4h           \n"
    "trn1        v1.4h, v17.4h, v19.4h           \n"
    "trn2        v2.4h, v16.4h, v18.4h           \n"
    "trn1        v0.4h, v16.4h, v18.4h           \n"
    "trn2        v7.4h, v21.4h, v23.4h           \n"
    "trn1        v5.4h, v21.4h, v23.4h           \n"
    "trn2        v6.4h, v20.4h, v22.4h           \n"
    "trn1        v4.4h, v20.4h, v22.4h           \n"

    // Stage 3: interleave 32-bit lanes, completing the 8x8 transpose
    // into v16..v23.
    "trn2        v21.2s, v1.2s, v5.2s            \n"
    "trn1        v17.2s, v1.2s, v5.2s            \n"
    "trn2        v20.2s, v0.2s, v4.2s            \n"
    "trn1        v16.2s, v0.2s, v4.2s            \n"
    "trn2        v23.2s, v3.2s, v7.2s            \n"
    "trn1        v19.2s, v3.2s, v7.2s            \n"
    "trn2        v22.2s, v2.2s, v6.2s            \n"
    "trn1        v18.2s, v2.2s, v6.2s            \n"

    "mov         %0, %2                          \n"

    // Store the 8 transposed rows.
    MEMACCESS(0)
    "st1         {v17.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v16.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v19.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v18.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v21.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v20.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v23.8b}, [%0], %6              \n"
    MEMACCESS(0)
    "st1         {v22.8b}, [%0]                  \n"

    "add         %1, %1, #8                      \n"  // src += 8
    "add         %2, %2, %6, lsl #3              \n"  // dst += 8 * dst_stride
    "subs        %3, %3, #8                      \n"  // w   -= 8
    "b.ge        1b                              \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %3, %3, #8                      \n"
    "b.eq        4f                              \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %3, #2                          \n"
    "b.lt        3f                              \n"

    "cmp         %3, #4                          \n"
    "b.lt        2f                              \n"

    // 4x8 block: load 8 rows of 4 bytes into the word lanes of v0/v1,
    // then use the kVTbl4x4Transpose lookup to transpose each half.
    "mov         %0, %1                          \n"
    MEMACCESS(0)
    "ld1         {v0.s}[0], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v0.s}[1], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v0.s}[2], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v0.s}[3], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.s}[0], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.s}[1], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.s}[2], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.s}[3], [%0]                 \n"

    "mov         %0, %2                          \n"

    MEMACCESS(4)
    "ld1         {v2.16b}, [%4]                  \n"

    "tbl         v3.16b, {v0.16b}, v2.16b        \n"
    "tbl         v0.16b, {v1.16b}, v2.16b        \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    MEMACCESS(0)
    "st1         {v3.s}[0], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v3.s}[1], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v3.s}[2], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v3.s}[3], [%0]                 \n"

    // Second half of each output row starts 4 bytes in.
    "add         %0, %2, #4                      \n"
    MEMACCESS(0)
    "st1         {v0.s}[0], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v0.s}[1], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v0.s}[2], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v0.s}[3], [%0]                 \n"

    "add         %1, %1, #4                      \n"  // src += 4
    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
    "subs        %3, %3, #4                      \n"  // w   -= 4
    "b.eq        4f                              \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %3, #2                          \n"
    "b.lt        3f                              \n"

    // 2x8 block: gather 8 rows of 2 bytes into v0/v1 and transpose
    // with a single byte-level trn pair.
    "2:                                          \n"
    "mov         %0, %1                          \n"
    MEMACCESS(0)
    "ld1         {v0.h}[0], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.h}[0], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v0.h}[1], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.h}[1], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v0.h}[2], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.h}[2], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v0.h}[3], [%0], %5             \n"
    MEMACCESS(0)
    "ld1         {v1.h}[3], [%0]                 \n"

    "trn2        v2.8b, v0.8b, v1.8b             \n"
    "trn1        v3.8b, v0.8b, v1.8b             \n"

    "mov         %0, %2                          \n"

    MEMACCESS(0)
    "st1         {v3.8b}, [%0], %6               \n"
    MEMACCESS(0)
    "st1         {v2.8b}, [%0]                   \n"

    "add         %1, %1, #2                      \n"  // src += 2
    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
    "subs        %3, %3, #2                      \n"  // w   -= 2
    "b.eq        4f                              \n"

    // 1x8 block: the last column is just a strided gather of 8 bytes
    // written out as one contiguous row.
    "3:                                          \n"
    MEMACCESS(1)
    "ld1         {v0.b}[0], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[1], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[2], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[3], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[4], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[5], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[6], [%1], %5             \n"
    MEMACCESS(1)
    "ld1         {v0.b}[7], [%1]                 \n"

    MEMACCESS(2)
    "st1         {v0.8b}, [%2]                   \n"

    "4:                                          \n"

    : "=&r"(src_temp),                            // %0
      "+r"(src),                                  // %1
      "+r"(dst),                                  // %2
      "+r"(width64)                               // %3
    : "r"(&kVTbl4x4Transpose),                    // %4
      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
  );
}
252
// Two 16-byte shuffle vectors (loaded into v30/v31) for the 4x8
// residual path of TransposeUVWx8_NEON.  Used with four-register
// `tbl` lookups over {v0..v3} / {v4..v7}, so each index selects one
// of 64 source bytes; the first half gathers the even (U) bytes of
// each interleaved UV row, the second half the odd (V) bytes.
static uint8 kVTbl4x4TransposeDi[32] = {
    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
256
// Transposes an 8-row tile of interleaved UV pairs, de-interleaving
// as it goes: reads 8 rows of `width` UV pairs (2*width bytes) from
// `src` and writes `width` rows of 8 U bytes to `dst_a` and 8 V bytes
// to `dst_b`.  The main path works on 8x8-pair blocks via trn1/trn2
// butterflies on 16-byte registers (the byte-level trn stage also
// separates U from V); residual widths of 4, 2 and 1 are handled by
// tail paths.
//
// asm operand map:
//   %0 src_temp - scratch pointer that walks down the rows of a block
//   %1 src      - advanced rightwards as columns are consumed
//   %2 dst_a    - U output, advanced down as rows are produced
//   %3 dst_b    - V output, advanced down as rows are produced
//   %4 width64  - remaining width, used as the loop counter
//   %5,%6,%7    - src_stride / dst_stride_a / dst_stride_b (ptrdiff_t)
//   %8          - address of kVTbl4x4TransposeDi (4x8 residual shuffle)
void TransposeUVWx8_NEON(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  const uint8* src_temp;
  int64 width64 = (int64)width;  // Work around clang 3.4 warning.
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub         %4, %4, #8                      \n"

    // handle 8x8 blocks. this should be the majority of the plane
    "1:                                          \n"
    "mov         %0, %1                          \n"

    // Load 8 rows of 8 UV pairs (16 bytes) into v0..v7.
    MEMACCESS(0)
    "ld1         {v0.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v1.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v2.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v3.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v4.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v5.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v6.16b}, [%0], %5              \n"
    MEMACCESS(0)
    "ld1         {v7.16b}, [%0]                  \n"

    // Stage 1: byte interleave of adjacent row pairs.  Because rows
    // are interleaved UV, trn1 collects the U bytes and trn2 the V
    // bytes of each pair.
    "trn1        v16.16b, v0.16b, v1.16b         \n"
    "trn2        v17.16b, v0.16b, v1.16b         \n"
    "trn1        v18.16b, v2.16b, v3.16b         \n"
    "trn2        v19.16b, v2.16b, v3.16b         \n"
    "trn1        v20.16b, v4.16b, v5.16b         \n"
    "trn2        v21.16b, v4.16b, v5.16b         \n"
    "trn1        v22.16b, v6.16b, v7.16b         \n"
    "trn2        v23.16b, v6.16b, v7.16b         \n"

    // Stage 2: 16-bit interleave; U data collects in v0..v3, V data
    // in v4..v7.
    "trn1        v0.8h, v16.8h, v18.8h           \n"
    "trn2        v1.8h, v16.8h, v18.8h           \n"
    "trn1        v2.8h, v20.8h, v22.8h           \n"
    "trn2        v3.8h, v20.8h, v22.8h           \n"
    "trn1        v4.8h, v17.8h, v19.8h           \n"
    "trn2        v5.8h, v17.8h, v19.8h           \n"
    "trn1        v6.8h, v21.8h, v23.8h           \n"
    "trn2        v7.8h, v21.8h, v23.8h           \n"

    // Stage 3: 32-bit interleave finishes the transpose: U rows in
    // v16..v19, V rows in v20..v23 (two output rows per register).
    "trn1        v16.4s, v0.4s, v2.4s            \n"
    "trn2        v17.4s, v0.4s, v2.4s            \n"
    "trn1        v18.4s, v1.4s, v3.4s            \n"
    "trn2        v19.4s, v1.4s, v3.4s            \n"
    "trn1        v20.4s, v4.4s, v6.4s            \n"
    "trn2        v21.4s, v4.4s, v6.4s            \n"
    "trn1        v22.4s, v5.4s, v7.4s            \n"
    "trn2        v23.4s, v5.4s, v7.4s            \n"

    "mov         %0, %2                          \n"

    // Store 8 U rows (one 8-byte d-lane per row).
    MEMACCESS(0)
    "st1         {v16.d}[0], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v18.d}[0], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v17.d}[0], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v19.d}[0], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v16.d}[1], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v18.d}[1], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v17.d}[1], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v19.d}[1], [%0]                \n"

    "mov         %0, %3                          \n"

    // Store 8 V rows.
    MEMACCESS(0)
    "st1         {v20.d}[0], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v22.d}[0], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v21.d}[0], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v23.d}[0], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v20.d}[1], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v22.d}[1], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v21.d}[1], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v23.d}[1], [%0]                \n"

    "add         %1, %1, #16                     \n"  // src   += 8*2
    "add         %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
    "add         %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
    "subs        %4, %4, #8                      \n"  // w     -= 8
    "b.ge        1b                              \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds        %4, %4, #8                      \n"
    "b.eq        4f                              \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp         %4, #2                          \n"
    "b.lt        3f                              \n"

    "cmp         %4, #4                          \n"
    "b.lt        2f                              \n"

    // TODO(frkoenig): Clean this up
    // 4x8 block: load 8 rows of 4 UV pairs into v0..v7, then use the
    // two kVTbl4x4TransposeDi vectors with 4-register tbl lookups to
    // produce transposed U rows (v16/v18) and V rows (v17/v19).
    "mov         %0, %1                          \n"
    MEMACCESS(0)
    "ld1         {v0.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v1.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v2.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v3.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v4.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v5.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v6.8b}, [%0], %5               \n"
    MEMACCESS(0)
    "ld1         {v7.8b}, [%0]                   \n"

    MEMACCESS(8)
    "ld1         {v30.16b}, [%8], #16            \n"
    "ld1         {v31.16b}, [%8]                 \n"

    "tbl         v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
    "tbl         v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
    "tbl         v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
    "tbl         v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"

    "mov         %0, %2                          \n"

    MEMACCESS(0)
    "st1         {v16.s}[0], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v16.s}[1], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v16.s}[2], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v16.s}[3], [%0], %6            \n"

    // Second half of each U row starts 4 bytes in.
    "add         %0, %2, #4                      \n"
    MEMACCESS(0)
    "st1         {v18.s}[0], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v18.s}[1], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v18.s}[2], [%0], %6            \n"
    MEMACCESS(0)
    "st1         {v18.s}[3], [%0]                \n"

    "mov         %0, %3                          \n"

    MEMACCESS(0)
    "st1         {v17.s}[0], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v17.s}[1], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v17.s}[2], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v17.s}[3], [%0], %7            \n"

    // Second half of each V row starts 4 bytes in.
    "add         %0, %3, #4                      \n"
    MEMACCESS(0)
    "st1         {v19.s}[0], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v19.s}[1], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v19.s}[2], [%0], %7            \n"
    MEMACCESS(0)
    "st1         {v19.s}[3], [%0]                \n"

    "add         %1, %1, #8                      \n"  // src   += 4 * 2
    "add         %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
    "add         %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
    "subs        %4, %4, #4                      \n"  // w     -= 4
    "b.eq        4f                              \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp         %4, #2                          \n"
    "b.lt        3f                              \n"

    // 2x8 block: ld2 de-interleaves U into v0/v2 and V into v1/v3;
    // a byte trn then transposes each plane's 2x8 tile.
    "2:                                          \n"
    "mov         %0, %1                          \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[0], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[0], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[1], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[1], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[2], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[2], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v0.h, v1.h}[3], [%0], %5       \n"
    MEMACCESS(0)
    "ld2         {v2.h, v3.h}[3], [%0]           \n"

    "trn1        v4.8b, v0.8b, v2.8b             \n"
    "trn2        v5.8b, v0.8b, v2.8b             \n"
    "trn1        v6.8b, v1.8b, v3.8b             \n"
    "trn2        v7.8b, v1.8b, v3.8b             \n"

    "mov         %0, %2                          \n"

    MEMACCESS(0)
    "st1         {v4.d}[0], [%0], %6             \n"
    MEMACCESS(0)
    "st1         {v6.d}[0], [%0]                 \n"

    "mov         %0, %3                          \n"

    MEMACCESS(0)
    "st1         {v5.d}[0], [%0], %7             \n"
    MEMACCESS(0)
    "st1         {v7.d}[0], [%0]                 \n"

    "add         %1, %1, #4                      \n"  // src   += 2 * 2
    "add         %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
    "add         %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
    "subs        %4, %4, #2                      \n"  // w     -= 2
    "b.eq        4f                              \n"

    // 1x8 block: strided ld2 gathers the last column's U bytes into
    // v0 and V bytes into v1; each is written as one 8-byte row.
    "3:                                          \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[0], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[1], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[2], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[3], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[4], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[5], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[6], [%1], %5       \n"
    MEMACCESS(1)
    "ld2         {v0.b, v1.b}[7], [%1]           \n"

    MEMACCESS(2)
    "st1         {v0.d}[0], [%2]                 \n"
    MEMACCESS(3)
    "st1         {v1.d}[0], [%3]                 \n"

    "4:                                          \n"

    : "=&r"(src_temp),                             // %0
      "+r"(src),                                   // %1
      "+r"(dst_a),                                 // %2
      "+r"(dst_b),                                 // %3
      "+r"(width64)                                // %4
    : "r"(static_cast<ptrdiff_t>(src_stride)),     // %5
      "r"(static_cast<ptrdiff_t>(dst_stride_a)),   // %6
      "r"(static_cast<ptrdiff_t>(dst_stride_b)),   // %7
      "r"(&kVTbl4x4TransposeDi)                    // %8
    : "memory", "cc",
      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
      "v30", "v31"
  );
}
544 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
545
546 #ifdef __cplusplus
547 } // extern "C"
548 } // namespace libyuv
549 #endif
550