/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)

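// vtbl indices that transpose a 4x4 byte block: gathering every 4th
// byte (0, 4, 8, 12, ...) turns the rows of a row-major 4x4 block held
// in two D registers into columns.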
static const uvec8 kVTbl4x4Transpose =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // loops are on blocks of 8. the loop will stop when the
    // counter drops below 0. starting the counter at w-8
    // allows for this.
    "sub %4, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane.
    ".p2align 4 \n"
    "1: \n"
    "mov r9, %0 \n"

    "vld1.8 {d0}, [r9], %1 \n"
    "vld1.8 {d1}, [r9], %1 \n"
    "vld1.8 {d2}, [r9], %1 \n"
    "vld1.8 {d3}, [r9], %1 \n"
    "vld1.8 {d4}, [r9], %1 \n"
    "vld1.8 {d5}, [r9], %1 \n"
    "vld1.8 {d6}, [r9], %1 \n"
    "vld1.8 {d7}, [r9] \n"

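    // Transpose the 8x8 block in registers: three vtrn passes swap
    // successively larger 2x2 sub-blocks (byte, 16-bit, then 32-bit
    // lanes). The operands are given in swapped order relative to the
    // textbook network; the vrev16 pass and the swapped store order
    // below complete the transpose.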
48 "vtrn.8 d1, d0 \n"
49 "vtrn.8 d3, d2 \n"
50 "vtrn.8 d5, d4 \n"
51 "vtrn.8 d7, d6 \n"
52
53 "vtrn.16 d1, d3 \n"
54 "vtrn.16 d0, d2 \n"
55 "vtrn.16 d5, d7 \n"
56 "vtrn.16 d4, d6 \n"
57
58 "vtrn.32 d1, d5 \n"
59 "vtrn.32 d0, d4 \n"
60 "vtrn.32 d3, d7 \n"
61 "vtrn.32 d2, d6 \n"
62
63 "vrev16.8 q0, q0 \n"
64 "vrev16.8 q1, q1 \n"
65 "vrev16.8 q2, q2 \n"
66 "vrev16.8 q3, q3 \n"
67
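    // Store the transposed rows, writing the D registers in swapped
    // pairs (d1 before d0, d3 before d2, ...) to match the swapped
    // vtrn operands above.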
68 "mov r9, %2 \n"
69
70 "vst1.8 {d1}, [r9], %3 \n"
71 "vst1.8 {d0}, [r9], %3 \n"
72 "vst1.8 {d3}, [r9], %3 \n"
73 "vst1.8 {d2}, [r9], %3 \n"
74 "vst1.8 {d5}, [r9], %3 \n"
75 "vst1.8 {d4}, [r9], %3 \n"
76 "vst1.8 {d7}, [r9], %3 \n"
77 "vst1.8 {d6}, [r9] \n"
78
79 "add %0, #8 \n" // src += 8
80 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
81 "subs %4, #8 \n" // w -= 8
82 "bge 1b \n"
83
84 // add 8 back to counter. if the result is 0 there are
85 // no residuals.
86 "adds %4, #8 \n"
87 "beq 4f \n"
88
89 // some residual, so between 1 and 7 lines left to transpose
90 "cmp %4, #2 \n"
91 "blt 3f \n"
92
93 "cmp %4, #4 \n"
94 "blt 2f \n"
95
96 // 4x8 block
97 "mov r9, %0 \n"
98 "vld1.32 {d0[0]}, [r9], %1 \n"
99 "vld1.32 {d0[1]}, [r9], %1 \n"
100 "vld1.32 {d1[0]}, [r9], %1 \n"
101 "vld1.32 {d1[1]}, [r9], %1 \n"
102 "vld1.32 {d2[0]}, [r9], %1 \n"
103 "vld1.32 {d2[1]}, [r9], %1 \n"
104 "vld1.32 {d3[0]}, [r9], %1 \n"
105 "vld1.32 {d3[1]}, [r9] \n"
106
107 "mov r9, %2 \n"
108
109 "vld1.8 {q3}, [%5] \n"
110
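    // d0..d1 hold rows 0-3 of the 4-wide block and d2..d3 hold rows
    // 4-7; with the table in q3, each vtbl below gathers two
    // transposed 4-byte columns.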
111 "vtbl.8 d4, {d0, d1}, d6 \n"
112 "vtbl.8 d5, {d0, d1}, d7 \n"
113 "vtbl.8 d0, {d2, d3}, d6 \n"
114 "vtbl.8 d1, {d2, d3}, d7 \n"
115
    // TODO: rework the shuffle above so the result can be
    // stored with 4 writes instead of 8.
118 "vst1.32 {d4[0]}, [r9], %3 \n"
119 "vst1.32 {d4[1]}, [r9], %3 \n"
120 "vst1.32 {d5[0]}, [r9], %3 \n"
121 "vst1.32 {d5[1]}, [r9] \n"
122
123 "add r9, %2, #4 \n"
124 "vst1.32 {d0[0]}, [r9], %3 \n"
125 "vst1.32 {d0[1]}, [r9], %3 \n"
126 "vst1.32 {d1[0]}, [r9], %3 \n"
127 "vst1.32 {d1[1]}, [r9] \n"
128
129 "add %0, #4 \n" // src += 4
130 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
131 "subs %4, #4 \n" // w -= 4
132 "beq 4f \n"
133
    // some residual remains; check whether it includes a 2x8
    // block, or less.
136 "cmp %4, #2 \n"
137 "blt 3f \n"
138
139 // 2x8 block
140 "2: \n"
141 "mov r9, %0 \n"
142 "vld1.16 {d0[0]}, [r9], %1 \n"
143 "vld1.16 {d1[0]}, [r9], %1 \n"
144 "vld1.16 {d0[1]}, [r9], %1 \n"
145 "vld1.16 {d1[1]}, [r9], %1 \n"
146 "vld1.16 {d0[2]}, [r9], %1 \n"
147 "vld1.16 {d1[2]}, [r9], %1 \n"
148 "vld1.16 {d0[3]}, [r9], %1 \n"
149 "vld1.16 {d1[3]}, [r9] \n"
150
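    // Even rows were loaded into d0 and odd rows into d1, two bytes
    // per lane; a single vtrn.8 leaves column 0 in d0 and column 1
    // in d1.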
151 "vtrn.8 d0, d1 \n"
152
153 "mov r9, %2 \n"
154
155 "vst1.64 {d0}, [r9], %3 \n"
156 "vst1.64 {d1}, [r9] \n"
157
158 "add %0, #2 \n" // src += 2
159 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
160 "subs %4, #2 \n" // w -= 2
161 "beq 4f \n"
162
163 // 1x8 block
164 "3: \n"
165 "vld1.8 {d0[0]}, [%0], %1 \n"
166 "vld1.8 {d0[1]}, [%0], %1 \n"
167 "vld1.8 {d0[2]}, [%0], %1 \n"
168 "vld1.8 {d0[3]}, [%0], %1 \n"
169 "vld1.8 {d0[4]}, [%0], %1 \n"
170 "vld1.8 {d0[5]}, [%0], %1 \n"
171 "vld1.8 {d0[6]}, [%0], %1 \n"
172 "vld1.8 {d0[7]}, [%0] \n"
173
174 "vst1.64 {d0}, [%2] \n"
175
176 "4: \n"
177
178 : "+r"(src), // %0
179 "+r"(src_stride), // %1
180 "+r"(dst), // %2
181 "+r"(dst_stride), // %3
182 "+r"(width) // %4
183 : "r"(&kVTbl4x4Transpose) // %5
184 : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
185 );
186 }
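
// For reference, a scalar sketch of the operation the kernel above
// implements: element (y, x) of an 8-row source block lands at (x, y)
// of the destination. This helper and its name are illustrative
// additions (not part of libyuv's API) that document the intended
// behavior:
static void TransposeWx8_Sketch_C(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride,
                                  int width) {
  for (int x = 0; x < width; ++x) {  // each source column...
    for (int y = 0; y < 8; ++y) {    // ...becomes one 8-byte dst row.
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}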
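
// vtbl indices that interleave (zip) the bytes of two D registers:
// output bytes alternate between the first source register
// (indices 0-7) and the second (indices 8-15).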
static const uvec8 kVTbl4x4TransposeDi =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile (
    // loops are on blocks of 8. the loop will stop when the
    // counter drops below 0. starting the counter at w-8
    // allows for this.
    "sub %6, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane.
    ".p2align 4 \n"
    "1: \n"
    "mov r9, %0 \n"

    "vld2.8 {d0, d1}, [r9], %1 \n"
    "vld2.8 {d2, d3}, [r9], %1 \n"
    "vld2.8 {d4, d5}, [r9], %1 \n"
    "vld2.8 {d6, d7}, [r9], %1 \n"
    "vld2.8 {d16, d17}, [r9], %1 \n"
    "vld2.8 {d18, d19}, [r9], %1 \n"
    "vld2.8 {d20, d21}, [r9], %1 \n"
    "vld2.8 {d22, d23}, [r9] \n"

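    // vld2.8 deinterleaves each row: the first byte of every pair now
    // sits in the even D registers and the second byte in the odd
    // ones. The same vtrn/vrev16 network as above then transposes
    // both 8x8 blocks in parallel on Q registers.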
215 "vtrn.8 q1, q0 \n"
216 "vtrn.8 q3, q2 \n"
217 "vtrn.8 q9, q8 \n"
218 "vtrn.8 q11, q10 \n"
219
220 "vtrn.16 q1, q3 \n"
221 "vtrn.16 q0, q2 \n"
222 "vtrn.16 q9, q11 \n"
223 "vtrn.16 q8, q10 \n"
224
225 "vtrn.32 q1, q9 \n"
226 "vtrn.32 q0, q8 \n"
227 "vtrn.32 q3, q11 \n"
228 "vtrn.32 q2, q10 \n"
229
230 "vrev16.8 q0, q0 \n"
231 "vrev16.8 q1, q1 \n"
232 "vrev16.8 q2, q2 \n"
233 "vrev16.8 q3, q3 \n"
234 "vrev16.8 q8, q8 \n"
235 "vrev16.8 q9, q9 \n"
236 "vrev16.8 q10, q10 \n"
237 "vrev16.8 q11, q11 \n"
238
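    // The even D registers hold the transposed first-byte plane and
    // the odd ones the second-byte plane; each is stored in swapped
    // pair order, as in TransposeWx8_NEON.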
239 "mov r9, %2 \n"
240
241 "vst1.8 {d2}, [r9], %3 \n"
242 "vst1.8 {d0}, [r9], %3 \n"
243 "vst1.8 {d6}, [r9], %3 \n"
244 "vst1.8 {d4}, [r9], %3 \n"
245 "vst1.8 {d18}, [r9], %3 \n"
246 "vst1.8 {d16}, [r9], %3 \n"
247 "vst1.8 {d22}, [r9], %3 \n"
248 "vst1.8 {d20}, [r9] \n"
249
250 "mov r9, %4 \n"
251
252 "vst1.8 {d3}, [r9], %5 \n"
253 "vst1.8 {d1}, [r9], %5 \n"
254 "vst1.8 {d7}, [r9], %5 \n"
255 "vst1.8 {d5}, [r9], %5 \n"
256 "vst1.8 {d19}, [r9], %5 \n"
257 "vst1.8 {d17}, [r9], %5 \n"
258 "vst1.8 {d23}, [r9], %5 \n"
259 "vst1.8 {d21}, [r9] \n"
260
261 "add %0, #8*2 \n" // src += 8*2
262 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
263 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
264 "subs %6, #8 \n" // w -= 8
265 "bge 1b \n"
266
267 // add 8 back to counter. if the result is 0 there are
268 // no residuals.
269 "adds %6, #8 \n"
270 "beq 4f \n"
271
272 // some residual, so between 1 and 7 lines left to transpose
273 "cmp %6, #2 \n"
274 "blt 3f \n"
275
276 "cmp %6, #4 \n"
277 "blt 2f \n"
278
    // TODO(frkoenig): clean this up.
    // 4x8 block
    "mov r9, %0 \n"
    "vld1.64 {d0}, [r9], %1 \n"
    "vld1.64 {d1}, [r9], %1 \n"
    "vld1.64 {d2}, [r9], %1 \n"
    "vld1.64 {d3}, [r9], %1 \n"
    "vld1.64 {d4}, [r9], %1 \n"
    "vld1.64 {d5}, [r9], %1 \n"
    "vld1.64 {d6}, [r9], %1 \n"
    "vld1.64 {d7}, [r9] \n"

    "vld1.8 {q15}, [%7] \n"

    "vtrn.8 q0, q1 \n"
    "vtrn.8 q2, q3 \n"

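    // vtrn.8 gathered the first byte of each pair into q0 (rows 0-3)
    // and q2 (rows 4-7), and the second bytes into q1 and q3; the zip
    // table in q15 then interleaves each register pair into transposed
    // 4-byte columns: d16-d17 and d20-d21 for dst_a, d18-d19 and
    // d22-d23 for dst_b.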
296 "vtbl.8 d16, {d0, d1}, d30 \n"
297 "vtbl.8 d17, {d0, d1}, d31 \n"
298 "vtbl.8 d18, {d2, d3}, d30 \n"
299 "vtbl.8 d19, {d2, d3}, d31 \n"
300 "vtbl.8 d20, {d4, d5}, d30 \n"
301 "vtbl.8 d21, {d4, d5}, d31 \n"
302 "vtbl.8 d22, {d6, d7}, d30 \n"
303 "vtbl.8 d23, {d6, d7}, d31 \n"
304
305 "mov r9, %2 \n"
306
307 "vst1.32 {d16[0]}, [r9], %3 \n"
308 "vst1.32 {d16[1]}, [r9], %3 \n"
309 "vst1.32 {d17[0]}, [r9], %3 \n"
310 "vst1.32 {d17[1]}, [r9], %3 \n"
311
312 "add r9, %2, #4 \n"
313 "vst1.32 {d20[0]}, [r9], %3 \n"
314 "vst1.32 {d20[1]}, [r9], %3 \n"
315 "vst1.32 {d21[0]}, [r9], %3 \n"
316 "vst1.32 {d21[1]}, [r9] \n"
317
318 "mov r9, %4 \n"
319
320 "vst1.32 {d18[0]}, [r9], %5 \n"
321 "vst1.32 {d18[1]}, [r9], %5 \n"
322 "vst1.32 {d19[0]}, [r9], %5 \n"
323 "vst1.32 {d19[1]}, [r9], %5 \n"
324
325 "add r9, %4, #4 \n"
326 "vst1.32 {d22[0]}, [r9], %5 \n"
327 "vst1.32 {d22[1]}, [r9], %5 \n"
328 "vst1.32 {d23[0]}, [r9], %5 \n"
329 "vst1.32 {d23[1]}, [r9] \n"
330
331 "add %0, #4*2 \n" // src += 4 * 2
332 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
333 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
334 "subs %6, #4 \n" // w -= 4
335 "beq 4f \n"
336
    // some residual remains; check whether it includes a 2x8
    // block, or less.
339 "cmp %6, #2 \n"
340 "blt 3f \n"
341
342 // 2x8 block
343 "2: \n"
344 "mov r9, %0 \n"
345 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
346 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
347 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
348 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
349 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
350 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
351 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
352 "vld2.16 {d1[3], d3[3]}, [r9] \n"
353
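    // The deinterleaving loads put pair 0 of the even/odd rows in
    // d0/d1 and pair 1 in d2/d3; vtrn.8 then separates the planes
    // into columns: d0/d2 go to dst_a, d1/d3 to dst_b.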
354 "vtrn.8 d0, d1 \n"
355 "vtrn.8 d2, d3 \n"
356
357 "mov r9, %2 \n"
358
359 "vst1.64 {d0}, [r9], %3 \n"
360 "vst1.64 {d2}, [r9] \n"
361
362 "mov r9, %4 \n"
363
364 "vst1.64 {d1}, [r9], %5 \n"
365 "vst1.64 {d3}, [r9] \n"
366
367 "add %0, #2*2 \n" // src += 2 * 2
368 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
369 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
370 "subs %6, #2 \n" // w -= 2
371 "beq 4f \n"
372
373 // 1x8 block
374 "3: \n"
375 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
376 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
377 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
378 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
379 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
380 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
381 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
382 "vld2.8 {d0[7], d1[7]}, [%0] \n"
383
384 "vst1.64 {d0}, [%2] \n"
385 "vst1.64 {d1}, [%4] \n"
386
387 "4: \n"
388
389 : "+r"(src), // %0
390 "+r"(src_stride), // %1
391 "+r"(dst_a), // %2
392 "+r"(dst_stride_a), // %3
393 "+r"(dst_b), // %4
394 "+r"(dst_stride_b), // %5
395 "+r"(width) // %6
396 : "r"(&kVTbl4x4TransposeDi) // %7
397 : "memory", "cc", "r9",
398 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
399 );
400 }
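
// Again for reference, a scalar sketch of the UV variant above: each
// source row holds interleaved byte pairs, and the transpose writes
// the first byte of each pair to dst_a and the second to dst_b. The
// helper and its name are illustrative additions, not part of
// libyuv's API:
static void TransposeUVWx8_Sketch_C(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int width) {
  for (int x = 0; x < width; ++x) {  // width counts byte pairs.
    for (int y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];
    }
  }
}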
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif