1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 // This module is for GCC Neon armv8 64 bit.
22 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
23
// tbl shuffle indices that transpose a 4x4 byte matrix held row-major in a
// single 16-byte vector: source byte [r*4 + c] is moved to [c*4 + r].
static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
                                        2, 6, 10, 14, 3, 7, 11, 15};
26
// Transposes a |width| x 8 block of 8-bit pixels: reads 8 rows of |width|
// bytes starting at |src| (rows |src_stride| apart) and writes the transpose
// as |width| rows of 8 bytes starting at |dst| (rows |dst_stride| apart).
// The main loop works in 8x8 tiles using trn1/trn2 at byte, halfword and
// word granularity; residual widths of 4, 2 and 1 columns are handled by
// dedicated tail blocks.
void TransposeWx8_NEON(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst,
                       int dst_stride,
                       int width) {
  // Scratch row pointer (%0 in the asm) used to walk rows without
  // disturbing the src/dst cursors.
  const uint8_t* src_temp;
  asm volatile(
      // loops are on blocks of 8. loop will stop when
      // counter gets to or below 0. starting the counter
      // at w-8 allow for this
      "sub         %w3, %w3, #8                  \n"

      // handle 8x8 blocks. this should be the majority of the plane
      "1:                                        \n"
      "mov         %0, %1                        \n"

      // Load 8 rows of 8 bytes into v0-v7.
      "ld1         {v0.8b}, [%0], %5             \n"
      "ld1         {v1.8b}, [%0], %5             \n"
      "ld1         {v2.8b}, [%0], %5             \n"
      "ld1         {v3.8b}, [%0], %5             \n"
      "ld1         {v4.8b}, [%0], %5             \n"
      "ld1         {v5.8b}, [%0], %5             \n"
      "ld1         {v6.8b}, [%0], %5             \n"
      "ld1         {v7.8b}, [%0]                 \n"
      "mov         %0, %1                        \n"

      // Stage 1: byte-level trn1/trn2 pairs rows; per-row prefetches for the
      // next tile are interleaved with the shuffles to hide latency.
      "trn2        v16.8b, v0.8b, v1.8b          \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // prefetch 7 lines ahead
      "trn1        v17.8b, v0.8b, v1.8b          \n"
      "add         %0, %0, %5                    \n"
      "trn2        v18.8b, v2.8b, v3.8b          \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 1
      "trn1        v19.8b, v2.8b, v3.8b          \n"
      "add         %0, %0, %5                    \n"
      "trn2        v20.8b, v4.8b, v5.8b          \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 2
      "trn1        v21.8b, v4.8b, v5.8b          \n"
      "add         %0, %0, %5                    \n"
      "trn2        v22.8b, v6.8b, v7.8b          \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 3
      "trn1        v23.8b, v6.8b, v7.8b          \n"
      "add         %0, %0, %5                    \n"

      // Stage 2: halfword-level trn combines 2x2 byte groups.
      "trn2        v3.4h, v17.4h, v19.4h         \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 4
      "trn1        v1.4h, v17.4h, v19.4h         \n"
      "add         %0, %0, %5                    \n"
      "trn2        v2.4h, v16.4h, v18.4h         \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 5
      "trn1        v0.4h, v16.4h, v18.4h         \n"
      "add         %0, %0, %5                    \n"
      "trn2        v7.4h, v21.4h, v23.4h         \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 6
      "trn1        v5.4h, v21.4h, v23.4h         \n"
      "add         %0, %0, %5                    \n"
      "trn2        v6.4h, v20.4h, v22.4h         \n"
      "prfm        pldl1keep, [%0, 448]          \n"  // row 7
      "trn1        v4.4h, v20.4h, v22.4h         \n"

      // Stage 3: word-level trn completes the 8x8 byte transpose in v16-v23.
      "trn2        v21.2s, v1.2s, v5.2s          \n"
      "trn1        v17.2s, v1.2s, v5.2s          \n"
      "trn2        v20.2s, v0.2s, v4.2s          \n"
      "trn1        v16.2s, v0.2s, v4.2s          \n"
      "trn2        v23.2s, v3.2s, v7.2s          \n"
      "trn1        v19.2s, v3.2s, v7.2s          \n"
      "trn2        v22.2s, v2.2s, v6.2s          \n"
      "trn1        v18.2s, v2.2s, v6.2s          \n"

      "mov         %0, %2                        \n"

      // Store the transposed tile, one 8-byte output row at a time.
      "st1         {v17.8b}, [%0], %6            \n"
      "st1         {v16.8b}, [%0], %6            \n"
      "st1         {v19.8b}, [%0], %6            \n"
      "st1         {v18.8b}, [%0], %6            \n"
      "st1         {v21.8b}, [%0], %6            \n"
      "st1         {v20.8b}, [%0], %6            \n"
      "st1         {v23.8b}, [%0], %6            \n"
      "st1         {v22.8b}, [%0]                \n"

      "add         %1, %1, #8                    \n"  // src += 8
      "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
      "subs        %w3, %w3, #8                  \n"  // w   -= 8
      "b.ge        1b                            \n"

      // add 8 back to counter. if the result is 0 there are
      // no residuals.
      "adds        %w3, %w3, #8                  \n"
      "b.eq        4f                           \n"

      // some residual, so between 1 and 7 lines left to transpose
      "cmp         %w3, #2                       \n"
      "b.lt        3f                            \n"

      "cmp         %w3, #4                       \n"
      "b.lt        2f                            \n"

      // 4x8 block: load 4 bytes per row into the lanes of v0 (rows 0-3) and
      // v1 (rows 4-7), then use kVTbl4x4Transpose (v2) to transpose each
      // 16-byte register as a 4x4 matrix.
      "mov         %0, %1                        \n"
      "ld1         {v0.s}[0], [%0], %5           \n"
      "ld1         {v0.s}[1], [%0], %5           \n"
      "ld1         {v0.s}[2], [%0], %5           \n"
      "ld1         {v0.s}[3], [%0], %5           \n"
      "ld1         {v1.s}[0], [%0], %5           \n"
      "ld1         {v1.s}[1], [%0], %5           \n"
      "ld1         {v1.s}[2], [%0], %5           \n"
      "ld1         {v1.s}[3], [%0]               \n"

      "mov         %0, %2                        \n"

      "ld1         {v2.16b}, [%4]                \n"

      "tbl         v3.16b, {v0.16b}, v2.16b      \n"
      "tbl         v0.16b, {v1.16b}, v2.16b      \n"

      // TODO(frkoenig): Rework shuffle above to
      // write out with 4 instead of 8 writes.
      "st1         {v3.s}[0], [%0], %6           \n"
      "st1         {v3.s}[1], [%0], %6           \n"
      "st1         {v3.s}[2], [%0], %6           \n"
      "st1         {v3.s}[3], [%0]               \n"

      // Second 4x4 lands 4 bytes to the right of the first in each row.
      "add         %0, %2, #4                    \n"
      "st1         {v0.s}[0], [%0], %6           \n"
      "st1         {v0.s}[1], [%0], %6           \n"
      "st1         {v0.s}[2], [%0], %6           \n"
      "st1         {v0.s}[3], [%0]               \n"

      "add         %1, %1, #4                    \n"  // src += 4
      "add         %2, %2, %6, lsl #2            \n"  // dst += 4 * dst_stride
      "subs        %w3, %w3, #4                  \n"  // w   -= 4
      "b.eq        4f                            \n"

      // some residual, check to see if it includes a 2x8 block,
      // or less
      "cmp         %w3, #2                       \n"
      "b.lt        3f                            \n"

      // 2x8 block: halfword lane loads gather the two columns; a byte trn
      // pair separates them into the two output rows.
      "2:                                        \n"
      "mov         %0, %1                        \n"
      "ld1         {v0.h}[0], [%0], %5           \n"
      "ld1         {v1.h}[0], [%0], %5           \n"
      "ld1         {v0.h}[1], [%0], %5           \n"
      "ld1         {v1.h}[1], [%0], %5           \n"
      "ld1         {v0.h}[2], [%0], %5           \n"
      "ld1         {v1.h}[2], [%0], %5           \n"
      "ld1         {v0.h}[3], [%0], %5           \n"
      "ld1         {v1.h}[3], [%0]               \n"

      "trn2        v2.8b, v0.8b, v1.8b           \n"
      "trn1        v3.8b, v0.8b, v1.8b           \n"

      "mov         %0, %2                        \n"

      "st1         {v3.8b}, [%0], %6             \n"
      "st1         {v2.8b}, [%0]                 \n"

      "add         %1, %1, #2                    \n"  // src += 2
      "add         %2, %2, %6, lsl #1            \n"  // dst += 2 * dst_stride
      "subs        %w3, %w3, #2                  \n"  // w   -= 2
      "b.eq        4f                            \n"

      // 1x8 block: a single column becomes a single 8-byte output row.
      "3:                                        \n"
      "ld1         {v0.b}[0], [%1], %5           \n"
      "ld1         {v0.b}[1], [%1], %5           \n"
      "ld1         {v0.b}[2], [%1], %5           \n"
      "ld1         {v0.b}[3], [%1], %5           \n"
      "ld1         {v0.b}[4], [%1], %5           \n"
      "ld1         {v0.b}[5], [%1], %5           \n"
      "ld1         {v0.b}[6], [%1], %5           \n"
      "ld1         {v0.b}[7], [%1]               \n"

      "st1         {v0.8b}, [%2]                 \n"

      "4:                                        \n"

      : "=&r"(src_temp),                          // %0
        "+r"(src),                                // %1
        "+r"(dst),                                // %2
        "+r"(width)                               // %3
      : "r"(&kVTbl4x4Transpose),                  // %4
        "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
        "r"(static_cast<ptrdiff_t>(dst_stride))   // %6
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v20", "v21", "v22", "v23");
}
214
// tbl indices into a 4-register (64-byte) lookup table of four rows loaded
// as 8-byte vectors (so only bytes 0-7 of each 16-byte register are valid,
// hence the 0-7 / 16-23 / 32-39 / 48-55 ranges).  The first 16 indices
// gather the even source bytes, the second 16 the odd bytes — used by
// TransposeUVWx8_NEON to transpose while splitting interleaved U/V.
static const uint8_t kVTbl4x4TransposeDi[32] = {
    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
218
// Transposes a |width| x 8 block of interleaved UV pairs: reads 8 rows of
// |width| UV pairs (2 * |width| bytes each) from |src| (rows |src_stride|
// apart) and writes the transposed, deinterleaved planes — |width| rows of
// 8 U bytes to |dst_a| and 8 V bytes to |dst_b|.  The main loop works in
// 8x8-pair tiles; residual widths of 4, 2 and 1 pairs have tail blocks.
void TransposeUVWx8_NEON(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  // Scratch row pointer (%0 in the asm) used to walk rows without
  // disturbing the src/dst cursors.
  const uint8_t* src_temp;
  asm volatile(
      // loops are on blocks of 8. loop will stop when
      // counter gets to or below 0. starting the counter
      // at w-8 allow for this
      "sub         %w4, %w4, #8                  \n"

      // handle 8x8 blocks. this should be the majority of the plane
      "1:                                        \n"
      "mov         %0, %1                        \n"

      // Load 8 rows of 8 UV pairs (16 bytes) into v0-v7.
      "ld1         {v0.16b}, [%0], %5            \n"
      "ld1         {v1.16b}, [%0], %5            \n"
      "ld1         {v2.16b}, [%0], %5            \n"
      "ld1         {v3.16b}, [%0], %5            \n"
      "ld1         {v4.16b}, [%0], %5            \n"
      "ld1         {v5.16b}, [%0], %5            \n"
      "ld1         {v6.16b}, [%0], %5            \n"
      "ld1         {v7.16b}, [%0]                \n"
      "mov         %0, %1                        \n"

      // Stage 1: byte-level trn pairs rows; because the input is UV
      // interleaved, trn1 collects U bytes and trn2 collects V bytes.
      "trn1        v16.16b, v0.16b, v1.16b       \n"
      "trn2        v17.16b, v0.16b, v1.16b       \n"
      "trn1        v18.16b, v2.16b, v3.16b       \n"
      "trn2        v19.16b, v2.16b, v3.16b       \n"
      "trn1        v20.16b, v4.16b, v5.16b       \n"
      "trn2        v21.16b, v4.16b, v5.16b       \n"
      "trn1        v22.16b, v6.16b, v7.16b       \n"
      "trn2        v23.16b, v6.16b, v7.16b       \n"

      // Stage 2: halfword-level trn combines row pairs into groups of four.
      "trn1        v0.8h, v16.8h, v18.8h         \n"
      "trn2        v1.8h, v16.8h, v18.8h         \n"
      "trn1        v2.8h, v20.8h, v22.8h         \n"
      "trn2        v3.8h, v20.8h, v22.8h         \n"
      "trn1        v4.8h, v17.8h, v19.8h         \n"
      "trn2        v5.8h, v17.8h, v19.8h         \n"
      "trn1        v6.8h, v21.8h, v23.8h         \n"
      "trn2        v7.8h, v21.8h, v23.8h         \n"

      // Stage 3: word-level trn finishes the transpose; v16-v19 hold the
      // U-plane rows (stored to dst_a below) and v20-v23 the V-plane rows.
      "trn1        v16.4s, v0.4s, v2.4s          \n"
      "trn2        v17.4s, v0.4s, v2.4s          \n"
      "trn1        v18.4s, v1.4s, v3.4s          \n"
      "trn2        v19.4s, v1.4s, v3.4s          \n"
      "trn1        v20.4s, v4.4s, v6.4s          \n"
      "trn2        v21.4s, v4.4s, v6.4s          \n"
      "trn1        v22.4s, v5.4s, v7.4s          \n"
      "trn2        v23.4s, v5.4s, v7.4s          \n"

      "mov         %0, %2                        \n"

      // Store 8 rows of 8 U bytes (one d lane each) to dst_a.
      "st1         {v16.d}[0], [%0], %6          \n"
      "st1         {v18.d}[0], [%0], %6          \n"
      "st1         {v17.d}[0], [%0], %6          \n"
      "st1         {v19.d}[0], [%0], %6          \n"
      "st1         {v16.d}[1], [%0], %6          \n"
      "st1         {v18.d}[1], [%0], %6          \n"
      "st1         {v17.d}[1], [%0], %6          \n"
      "st1         {v19.d}[1], [%0]              \n"

      "mov         %0, %3                        \n"

      // Store 8 rows of 8 V bytes to dst_b.
      "st1         {v20.d}[0], [%0], %7          \n"
      "st1         {v22.d}[0], [%0], %7          \n"
      "st1         {v21.d}[0], [%0], %7          \n"
      "st1         {v23.d}[0], [%0], %7          \n"
      "st1         {v20.d}[1], [%0], %7          \n"
      "st1         {v22.d}[1], [%0], %7          \n"
      "st1         {v21.d}[1], [%0], %7          \n"
      "st1         {v23.d}[1], [%0]              \n"

      "add         %1, %1, #16                   \n"  // src   += 8*2
      "add         %2, %2, %6, lsl #3            \n"  // dst_a += 8 *
                                                      // dst_stride_a
      "add         %3, %3, %7, lsl #3            \n"  // dst_b += 8 *
                                                      // dst_stride_b
      "subs        %w4, %w4, #8                  \n"  // w     -= 8
      "b.ge        1b                            \n"

      // add 8 back to counter. if the result is 0 there are
      // no residuals.
      "adds        %w4, %w4, #8                  \n"
      "b.eq        4f                            \n"

      // some residual, so between 1 and 7 lines left to transpose
      "cmp         %w4, #2                       \n"
      "b.lt        3f                            \n"

      "cmp         %w4, #4                       \n"
      "b.lt        2f                            \n"

      // TODO(frkoenig): Clean this up
      // 4x8 block: 8 rows of 4 UV pairs each; kVTbl4x4TransposeDi (v30/v31)
      // transposes and deinterleaves via 4-register tbl lookups.
      "mov         %0, %1                        \n"
      "ld1         {v0.8b}, [%0], %5             \n"
      "ld1         {v1.8b}, [%0], %5             \n"
      "ld1         {v2.8b}, [%0], %5             \n"
      "ld1         {v3.8b}, [%0], %5             \n"
      "ld1         {v4.8b}, [%0], %5             \n"
      "ld1         {v5.8b}, [%0], %5             \n"
      "ld1         {v6.8b}, [%0], %5             \n"
      "ld1         {v7.8b}, [%0]                 \n"

      "ld1         {v30.16b}, [%8], #16          \n"
      "ld1         {v31.16b}, [%8]               \n"

      "tbl         v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n"
      "tbl         v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n"
      "tbl         v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n"
      "tbl         v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n"

      "mov         %0, %2                        \n"

      // U plane: rows 0-3 contribution, then rows 4-7 at column offset 4.
      "st1         {v16.s}[0], [%0], %6          \n"
      "st1         {v16.s}[1], [%0], %6          \n"
      "st1         {v16.s}[2], [%0], %6          \n"
      "st1         {v16.s}[3], [%0], %6          \n"

      "add         %0, %2, #4                    \n"
      "st1         {v18.s}[0], [%0], %6          \n"
      "st1         {v18.s}[1], [%0], %6          \n"
      "st1         {v18.s}[2], [%0], %6          \n"
      "st1         {v18.s}[3], [%0]              \n"

      "mov         %0, %3                        \n"

      // V plane: same layout as the U stores above.
      "st1         {v17.s}[0], [%0], %7          \n"
      "st1         {v17.s}[1], [%0], %7          \n"
      "st1         {v17.s}[2], [%0], %7          \n"
      "st1         {v17.s}[3], [%0], %7          \n"

      "add         %0, %3, #4                    \n"
      "st1         {v19.s}[0], [%0], %7          \n"
      "st1         {v19.s}[1], [%0], %7          \n"
      "st1         {v19.s}[2], [%0], %7          \n"
      "st1         {v19.s}[3], [%0]              \n"

      "add         %1, %1, #8                    \n"  // src   += 4 * 2
      "add         %2, %2, %6, lsl #2            \n"  // dst_a += 4 *
                                                      // dst_stride_a
      "add         %3, %3, %7, lsl #2            \n"  // dst_b += 4 *
                                                      // dst_stride_b
      "subs        %w4, %w4, #4                  \n"  // w     -= 4
      "b.eq        4f                            \n"

      // some residual, check to see if it includes a 2x8 block,
      // or less
      "cmp         %w4, #2                       \n"
      "b.lt        3f                            \n"

      // 2x8 block: ld2 lane loads split U (v0/v2) from V (v1/v3) on the fly;
      // byte trn then separates the two columns.
      "2:                                        \n"
      "mov         %0, %1                        \n"
      "ld2         {v0.h, v1.h}[0], [%0], %5     \n"
      "ld2         {v2.h, v3.h}[0], [%0], %5     \n"
      "ld2         {v0.h, v1.h}[1], [%0], %5     \n"
      "ld2         {v2.h, v3.h}[1], [%0], %5     \n"
      "ld2         {v0.h, v1.h}[2], [%0], %5     \n"
      "ld2         {v2.h, v3.h}[2], [%0], %5     \n"
      "ld2         {v0.h, v1.h}[3], [%0], %5     \n"
      "ld2         {v2.h, v3.h}[3], [%0]         \n"

      "trn1        v4.8b, v0.8b, v2.8b           \n"
      "trn2        v5.8b, v0.8b, v2.8b           \n"
      "trn1        v6.8b, v1.8b, v3.8b           \n"
      "trn2        v7.8b, v1.8b, v3.8b           \n"

      "mov         %0, %2                        \n"

      "st1         {v4.d}[0], [%0], %6           \n"
      "st1         {v6.d}[0], [%0]               \n"

      "mov         %0, %3                        \n"

      "st1         {v5.d}[0], [%0], %7           \n"
      "st1         {v7.d}[0], [%0]               \n"

      "add         %1, %1, #4                    \n"  // src   += 2 * 2
      "add         %2, %2, %6, lsl #1            \n"  // dst_a += 2 *
                                                      // dst_stride_a
      "add         %3, %3, %7, lsl #1            \n"  // dst_b += 2 *
                                                      // dst_stride_b
      "subs        %w4, %w4, #2                  \n"  // w     -= 2
      "b.eq        4f                            \n"

      // 1x8 block: ld2 byte-lane loads deinterleave one UV column into v0
      // (U) and v1 (V), each written out as one 8-byte row.
      "3:                                        \n"
      "ld2         {v0.b, v1.b}[0], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[1], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[2], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[3], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[4], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[5], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[6], [%1], %5     \n"
      "ld2         {v0.b, v1.b}[7], [%1]         \n"

      "st1         {v0.d}[0], [%2]               \n"
      "st1         {v1.d}[0], [%3]               \n"

      "4:                                        \n"

      : "=&r"(src_temp),                            // %0
        "+r"(src),                                  // %1
        "+r"(dst_a),                                // %2
        "+r"(dst_b),                                // %3
        "+r"(width)                                 // %4
      : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
        "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
        "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
        "r"(&kVTbl4x4TransposeDi)                   // %8
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
}
438 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
439
440 #ifdef __cplusplus
441 } // extern "C"
442 } // namespace libyuv
443 #endif
444