1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/rotate_row.h"
12 #include "libyuv/row.h"
13
14 #include "libyuv/basic_types.h"
15
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20
21 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
22 !defined(__aarch64__)
23
// vtbl shuffle indices that transpose a 4x4 byte matrix: each group of four
// indices gathers one column (stride-4 bytes) of the row-major input, so the
// output is the column-major (transposed) layout.  Used by the 4x8 residual
// path in TransposeWx8_NEON.
static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13,
                                        2, 6, 10, 14, 3, 7, 11, 15};
26
// Transposes a |width| x 8 tile of 8-bit pixels: byte (x, y) of |src| is
// written to byte (y, x) of |dst|.  Source rows are |src_stride| bytes apart,
// destination rows |dst_stride| bytes apart.  The main loop transposes 8x8
// blocks entirely in registers via vtrn/vrev; leftover columns (width % 8)
// are finished by the 4x8, 2x8 and 1x8 tail paths below.
void TransposeWx8_NEON(const uint8_t* src,
                       int src_stride,
                       uint8_t* dst,
                       int dst_stride,
                       int width) {
  const uint8_t* src_temp;  // roving pointer (%0) used to walk rows/columns
  asm volatile(
      // loops are on blocks of 8. loop will stop when
      // counter gets to or below 0. starting the counter
      // at w-8 allow for this
      "sub %5, #8 \n"

      // handle 8x8 blocks. this should be the majority of the plane
      "1: \n"
      "mov %0, %1 \n"

      // Load 8 source rows of 8 bytes each into d0..d7; %2 (src_stride) is
      // added after each load, except the last row.
      "vld1.8 {d0}, [%0], %2 \n"
      "vld1.8 {d1}, [%0], %2 \n"
      "vld1.8 {d2}, [%0], %2 \n"
      "vld1.8 {d3}, [%0], %2 \n"
      "vld1.8 {d4}, [%0], %2 \n"
      "vld1.8 {d5}, [%0], %2 \n"
      "vld1.8 {d6}, [%0], %2 \n"
      "vld1.8 {d7}, [%0] \n"

      // In-register 8x8 transpose: pairwise element swaps at byte, halfword
      // and word granularity.  Note the vtrn.8 operands are deliberately
      // swapped; the vrev16 pass below completes the byte-level exchange.
      "vtrn.8 d1, d0 \n"
      "vtrn.8 d3, d2 \n"
      "vtrn.8 d5, d4 \n"
      "vtrn.8 d7, d6 \n"

      "vtrn.16 d1, d3 \n"
      "vtrn.16 d0, d2 \n"
      "vtrn.16 d5, d7 \n"
      "vtrn.16 d4, d6 \n"

      "vtrn.32 d1, d5 \n"
      "vtrn.32 d0, d4 \n"
      "vtrn.32 d3, d7 \n"
      "vtrn.32 d2, d6 \n"

      "vrev16.8 q0, q0 \n"
      "vrev16.8 q1, q1 \n"
      "vrev16.8 q2, q2 \n"
      "vrev16.8 q3, q3 \n"

      "mov %0, %3 \n"

      // Store the 8 transposed rows; register order (d1,d0,d3,d2,...) matches
      // the swapped-operand transpose network above.
      "vst1.8 {d1}, [%0], %4 \n"
      "vst1.8 {d0}, [%0], %4 \n"
      "vst1.8 {d3}, [%0], %4 \n"
      "vst1.8 {d2}, [%0], %4 \n"
      "vst1.8 {d5}, [%0], %4 \n"
      "vst1.8 {d4}, [%0], %4 \n"
      "vst1.8 {d7}, [%0], %4 \n"
      "vst1.8 {d6}, [%0] \n"

      "add %1, #8 \n"             // src += 8
      "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
      "subs %5, #8 \n"            // w -= 8
      "bge 1b \n"

      // add 8 back to counter. if the result is 0 there are
      // no residuals.
      "adds %5, #8 \n"
      "beq 4f \n"

      // some residual, so between 1 and 7 lines left to transpose
      "cmp %5, #2 \n"
      "blt 3f \n"

      "cmp %5, #4 \n"
      "blt 2f \n"

      // 4x8 block: load 8 rows of 4 bytes as packed 32-bit lanes, then use
      // the kVTbl4x4Transpose shuffle (via %6) to transpose two 4x4 halves.
      "mov %0, %1 \n"
      "vld1.32 {d0[0]}, [%0], %2 \n"
      "vld1.32 {d0[1]}, [%0], %2 \n"
      "vld1.32 {d1[0]}, [%0], %2 \n"
      "vld1.32 {d1[1]}, [%0], %2 \n"
      "vld1.32 {d2[0]}, [%0], %2 \n"
      "vld1.32 {d2[1]}, [%0], %2 \n"
      "vld1.32 {d3[0]}, [%0], %2 \n"
      "vld1.32 {d3[1]}, [%0] \n"

      "mov %0, %3 \n"

      "vld1.8 {q3}, [%6] \n"

      "vtbl.8 d4, {d0, d1}, d6 \n"
      "vtbl.8 d5, {d0, d1}, d7 \n"
      "vtbl.8 d0, {d2, d3}, d6 \n"
      "vtbl.8 d1, {d2, d3}, d7 \n"

      // TODO(frkoenig): Rework shuffle above to
      // write out with 4 instead of 8 writes.
      "vst1.32 {d4[0]}, [%0], %4 \n"
      "vst1.32 {d4[1]}, [%0], %4 \n"
      "vst1.32 {d5[0]}, [%0], %4 \n"
      "vst1.32 {d5[1]}, [%0] \n"

      // Second 4x4 half lands 4 bytes to the right of the first.
      "add %0, %3, #4 \n"
      "vst1.32 {d0[0]}, [%0], %4 \n"
      "vst1.32 {d0[1]}, [%0], %4 \n"
      "vst1.32 {d1[0]}, [%0], %4 \n"
      "vst1.32 {d1[1]}, [%0] \n"

      "add %1, #4 \n"             // src += 4
      "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
      "subs %5, #4 \n"            // w -= 4
      "beq 4f \n"

      // some residual, check to see if it includes a 2x8 block,
      // or less
      "cmp %5, #2 \n"
      "blt 3f \n"

      // 2x8 block: gather 8 rows of 2 bytes, interleaved into d0/d1 so a
      // single vtrn.8 produces the two transposed output rows.
      "2: \n"
      "mov %0, %1 \n"
      "vld1.16 {d0[0]}, [%0], %2 \n"
      "vld1.16 {d1[0]}, [%0], %2 \n"
      "vld1.16 {d0[1]}, [%0], %2 \n"
      "vld1.16 {d1[1]}, [%0], %2 \n"
      "vld1.16 {d0[2]}, [%0], %2 \n"
      "vld1.16 {d1[2]}, [%0], %2 \n"
      "vld1.16 {d0[3]}, [%0], %2 \n"
      "vld1.16 {d1[3]}, [%0] \n"

      "vtrn.8 d0, d1 \n"

      "mov %0, %3 \n"

      "vst1.64 {d0}, [%0], %4 \n"
      "vst1.64 {d1}, [%0] \n"

      "add %1, #2 \n"             // src += 2
      "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
      "subs %5, #2 \n"            // w -= 2
      "beq 4f \n"

      // 1x8 block: a single source column becomes a single destination row.
      "3: \n"
      "vld1.8 {d0[0]}, [%1], %2 \n"
      "vld1.8 {d0[1]}, [%1], %2 \n"
      "vld1.8 {d0[2]}, [%1], %2 \n"
      "vld1.8 {d0[3]}, [%1], %2 \n"
      "vld1.8 {d0[4]}, [%1], %2 \n"
      "vld1.8 {d0[5]}, [%1], %2 \n"
      "vld1.8 {d0[6]}, [%1], %2 \n"
      "vld1.8 {d0[7]}, [%1] \n"

      "vst1.64 {d0}, [%3] \n"

      "4: \n"

      : "=&r"(src_temp),          // %0
        "+r"(src),                // %1
        "+r"(src_stride),         // %2
        "+r"(dst),                // %3
        "+r"(dst_stride),         // %4
        "+r"(width)               // %5
      : "r"(&kVTbl4x4Transpose)   // %6
      : "memory", "cc", "q0", "q1", "q2", "q3");
}
191
// vtbl shuffle indices that zip (interleave) the low and high 8-byte halves
// of a 16-byte vector: output byte 2i comes from index i, byte 2i+1 from
// index 8+i.  Used by the 4x8 residual path in TransposeUVWx8_NEON.
static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11,
                                          4, 12, 5, 13, 6, 14, 7, 15};
194
// Transposes a |width| x 8 tile of interleaved 2-byte pixel pairs (e.g. UV),
// splitting the channels: the first byte of each pair goes, transposed, to
// |dst_a| and the second byte to |dst_b|.  Source rows are |src_stride| bytes
// apart; output rows are |dst_stride_a| / |dst_stride_b| bytes apart.  The
// main loop handles 8x8 pair-blocks; residuals fall through to the 4x8, 2x8
// and 1x8 tails.
void TransposeUVWx8_NEON(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  const uint8_t* src_temp;  // roving pointer (%0) used to walk rows/columns
  asm volatile(
      // loops are on blocks of 8. loop will stop when
      // counter gets to or below 0. starting the counter
      // at w-8 allow for this
      "sub %7, #8 \n"

      // handle 8x8 blocks. this should be the majority of the plane
      "1: \n"
      "mov %0, %1 \n"

      // vld2.8 deinterleaves each row: even bytes (channel A) land in the
      // first register of the pair, odd bytes (channel B) in the second.
      "vld2.8 {d0, d1}, [%0], %2 \n"
      "vld2.8 {d2, d3}, [%0], %2 \n"
      "vld2.8 {d4, d5}, [%0], %2 \n"
      "vld2.8 {d6, d7}, [%0], %2 \n"
      "vld2.8 {d16, d17}, [%0], %2 \n"
      "vld2.8 {d18, d19}, [%0], %2 \n"
      "vld2.8 {d20, d21}, [%0], %2 \n"
      "vld2.8 {d22, d23}, [%0] \n"

      // In-register 8x8 transpose of both channels at once (q registers hold
      // an A-row and a B-row side by side).  As in TransposeWx8_NEON, the
      // vtrn.8 operands are swapped and the vrev16 pass completes the
      // byte-level exchange.
      "vtrn.8 q1, q0 \n"
      "vtrn.8 q3, q2 \n"
      "vtrn.8 q9, q8 \n"
      "vtrn.8 q11, q10 \n"

      "vtrn.16 q1, q3 \n"
      "vtrn.16 q0, q2 \n"
      "vtrn.16 q9, q11 \n"
      "vtrn.16 q8, q10 \n"

      "vtrn.32 q1, q9 \n"
      "vtrn.32 q0, q8 \n"
      "vtrn.32 q3, q11 \n"
      "vtrn.32 q2, q10 \n"

      "vrev16.8 q0, q0 \n"
      "vrev16.8 q1, q1 \n"
      "vrev16.8 q2, q2 \n"
      "vrev16.8 q3, q3 \n"
      "vrev16.8 q8, q8 \n"
      "vrev16.8 q9, q9 \n"
      "vrev16.8 q10, q10 \n"
      "vrev16.8 q11, q11 \n"

      // Store the 8 transposed channel-A rows to dst_a (even d registers).
      "mov %0, %3 \n"

      "vst1.8 {d2}, [%0], %4 \n"
      "vst1.8 {d0}, [%0], %4 \n"
      "vst1.8 {d6}, [%0], %4 \n"
      "vst1.8 {d4}, [%0], %4 \n"
      "vst1.8 {d18}, [%0], %4 \n"
      "vst1.8 {d16}, [%0], %4 \n"
      "vst1.8 {d22}, [%0], %4 \n"
      "vst1.8 {d20}, [%0] \n"

      // Store the 8 transposed channel-B rows to dst_b (odd d registers).
      "mov %0, %5 \n"

      "vst1.8 {d3}, [%0], %6 \n"
      "vst1.8 {d1}, [%0], %6 \n"
      "vst1.8 {d7}, [%0], %6 \n"
      "vst1.8 {d5}, [%0], %6 \n"
      "vst1.8 {d19}, [%0], %6 \n"
      "vst1.8 {d17}, [%0], %6 \n"
      "vst1.8 {d23}, [%0], %6 \n"
      "vst1.8 {d21}, [%0] \n"

      "add %1, #8*2 \n"           // src += 8*2
      "add %3, %3, %4, lsl #3 \n" // dst_a += 8 *
                                  // dst_stride_a
      "add %5, %5, %6, lsl #3 \n" // dst_b += 8 *
                                  // dst_stride_b
      "subs %7, #8 \n"            // w -= 8
      "bge 1b \n"

      // add 8 back to counter. if the result is 0 there are
      // no residuals.
      "adds %7, #8 \n"
      "beq 4f \n"

      // some residual, so between 1 and 7 lines left to transpose
      "cmp %7, #2 \n"
      "blt 3f \n"

      "cmp %7, #4 \n"
      "blt 2f \n"

      // TODO(frkoenig): Clean this up
      // 4x8 block: load 8 rows of 4 interleaved pairs still packed, then
      // vtrn + the kVTbl4x4TransposeDi zip shuffle (via %8) separate and
      // transpose the two channels.
      "mov %0, %1 \n"
      "vld1.64 {d0}, [%0], %2 \n"
      "vld1.64 {d1}, [%0], %2 \n"
      "vld1.64 {d2}, [%0], %2 \n"
      "vld1.64 {d3}, [%0], %2 \n"
      "vld1.64 {d4}, [%0], %2 \n"
      "vld1.64 {d5}, [%0], %2 \n"
      "vld1.64 {d6}, [%0], %2 \n"
      "vld1.64 {d7}, [%0] \n"

      "vld1.8 {q15}, [%8] \n"

      "vtrn.8 q0, q1 \n"
      "vtrn.8 q2, q3 \n"

      "vtbl.8 d16, {d0, d1}, d30 \n"
      "vtbl.8 d17, {d0, d1}, d31 \n"
      "vtbl.8 d18, {d2, d3}, d30 \n"
      "vtbl.8 d19, {d2, d3}, d31 \n"
      "vtbl.8 d20, {d4, d5}, d30 \n"
      "vtbl.8 d21, {d4, d5}, d31 \n"
      "vtbl.8 d22, {d6, d7}, d30 \n"
      "vtbl.8 d23, {d6, d7}, d31 \n"

      // Channel A: first 4x4 half to dst_a, second half 4 bytes right.
      "mov %0, %3 \n"

      "vst1.32 {d16[0]}, [%0], %4 \n"
      "vst1.32 {d16[1]}, [%0], %4 \n"
      "vst1.32 {d17[0]}, [%0], %4 \n"
      "vst1.32 {d17[1]}, [%0], %4 \n"

      "add %0, %3, #4 \n"
      "vst1.32 {d20[0]}, [%0], %4 \n"
      "vst1.32 {d20[1]}, [%0], %4 \n"
      "vst1.32 {d21[0]}, [%0], %4 \n"
      "vst1.32 {d21[1]}, [%0] \n"

      // Channel B: same pattern into dst_b.
      "mov %0, %5 \n"

      "vst1.32 {d18[0]}, [%0], %6 \n"
      "vst1.32 {d18[1]}, [%0], %6 \n"
      "vst1.32 {d19[0]}, [%0], %6 \n"
      "vst1.32 {d19[1]}, [%0], %6 \n"

      "add %0, %5, #4 \n"
      "vst1.32 {d22[0]}, [%0], %6 \n"
      "vst1.32 {d22[1]}, [%0], %6 \n"
      "vst1.32 {d23[0]}, [%0], %6 \n"
      "vst1.32 {d23[1]}, [%0] \n"

      "add %1, #4*2 \n"           // src += 4 * 2
      "add %3, %3, %4, lsl #2 \n" // dst_a += 4 *
                                  // dst_stride_a
      "add %5, %5, %6, lsl #2 \n" // dst_b += 4 *
                                  // dst_stride_b
      "subs %7, #4 \n"            // w -= 4
      "beq 4f \n"

      // some residual, check to see if it includes a 2x8 block,
      // or less
      "cmp %7, #2 \n"
      "blt 3f \n"

      // 2x8 block: vld2.16 splits each row's two pairs across d0/d2 (A) and
      // d1/d3 (B) lanes; one vtrn.8 per channel finishes the transpose.
      "2: \n"
      "mov %0, %1 \n"
      "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
      "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
      "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
      "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
      "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
      "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
      "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
      "vld2.16 {d1[3], d3[3]}, [%0] \n"

      "vtrn.8 d0, d1 \n"
      "vtrn.8 d2, d3 \n"

      "mov %0, %3 \n"

      "vst1.64 {d0}, [%0], %4 \n"
      "vst1.64 {d2}, [%0] \n"

      "mov %0, %5 \n"

      "vst1.64 {d1}, [%0], %6 \n"
      "vst1.64 {d3}, [%0] \n"

      "add %1, #2*2 \n"           // src += 2 * 2
      "add %3, %3, %4, lsl #1 \n" // dst_a += 2 *
                                  // dst_stride_a
      "add %5, %5, %6, lsl #1 \n" // dst_b += 2 *
                                  // dst_stride_b
      "subs %7, #2 \n"            // w -= 2
      "beq 4f \n"

      // 1x8 block: one pair-column becomes one row of dst_a and one of dst_b.
      "3: \n"
      "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
      "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
      "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
      "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
      "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
      "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
      "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
      "vld2.8 {d0[7], d1[7]}, [%1] \n"

      "vst1.64 {d0}, [%3] \n"
      "vst1.64 {d1}, [%5] \n"

      "4: \n"

      : "=&r"(src_temp),            // %0
        "+r"(src),                  // %1
        "+r"(src_stride),           // %2
        "+r"(dst_a),                // %3
        "+r"(dst_stride_a),         // %4
        "+r"(dst_b),                // %5
        "+r"(dst_stride_b),         // %6
        "+r"(width)                 // %7
      : "r"(&kVTbl4x4TransposeDi)   // %8
      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
}
413 #endif // defined(__ARM_NEON__) && !defined(__aarch64__)
414
415 #ifdef __cplusplus
416 } // extern "C"
417 } // namespace libyuv
418 #endif
419