1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/rotate.h"
12
13 #include "libyuv/cpu_id.h"
14 #include "libyuv/convert.h"
15 #include "libyuv/planar_functions.h"
16 #include "libyuv/row.h"
17
18 #ifdef __cplusplus
19 namespace libyuv {
20 extern "C" {
21 #endif
22
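// DECLARE_FUNCTION emits the assembler boilerplate (.text section, alignment
// and label) for the file-scope asm version of TransposeUVWx8_SSE2 further
// below; Apple and MinGW/Cygwin i386 targets need a leading underscore on
// C symbol names.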
23 #if !defined(LIBYUV_DISABLE_X86) && \
24 (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
25 #if defined(__APPLE__) && defined(__i386__)
26 #define DECLARE_FUNCTION(name) \
27 ".text \n" \
28 ".private_extern _" #name " \n" \
29 ".align 4,0x90 \n" \
30 "_" #name ": \n"
31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
32 #define DECLARE_FUNCTION(name) \
33 ".text \n" \
34 ".align 4,0x90 \n" \
35 "_" #name ": \n"
36 #else
37 #define DECLARE_FUNCTION(name) \
38 ".text \n" \
39 ".align 4,0x90 \n" \
40 #name ": \n"
41 #endif
42 #endif
43
44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
45 (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
46 #define HAS_MIRRORROW_NEON
47 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
48 #define HAS_MIRRORROW_UV_NEON
49 void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
50 #define HAS_TRANSPOSE_WX8_NEON
51 void TransposeWx8_NEON(const uint8* src, int src_stride,
52 uint8* dst, int dst_stride, int width);
53 #define HAS_TRANSPOSE_UVWX8_NEON
54 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
55 uint8* dst_a, int dst_stride_a,
56 uint8* dst_b, int dst_stride_b,
57 int width);
58 #endif // defined(__ARM_NEON__)
59
60 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
61 defined(__mips__) && \
62 defined(__mips_dsp) && (__mips_dsp_rev >= 2)
63 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
64 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
65 uint8* dst, int dst_stride, int width);
66
67 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
68 uint8* dst, int dst_stride, int width);
69 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
70 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
71 uint8* dst_a, int dst_stride_a,
72 uint8* dst_b, int dst_stride_b,
73 int width);
74 #endif // defined(__mips__)
75
76 #if !defined(LIBYUV_DISABLE_X86) && \
77 defined(_M_IX86) && defined(_MSC_VER)
78 #define HAS_TRANSPOSE_WX8_SSSE3
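// Transposes an 8-row strip, 8 source columns per pass: each iteration loads
// an 8x8 block of bytes and interleaves it in three rounds (bytes, then
// words, then dwords) before storing eight 8-byte rows to dst.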
79 __declspec(naked) __declspec(align(16))
80 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
81 uint8* dst, int dst_stride, int width) {
82 __asm {
83 push edi
84 push esi
85 push ebp
86 mov eax, [esp + 12 + 4] // src
87 mov edi, [esp + 12 + 8] // src_stride
88 mov edx, [esp + 12 + 12] // dst
89 mov esi, [esp + 12 + 16] // dst_stride
90 mov ecx, [esp + 12 + 20] // width
91
92 // Read in the data from the source pointer.
93 // First round of bit swap.
94 align 4
95 convertloop:
96 movq xmm0, qword ptr [eax]
97 lea ebp, [eax + 8]
98 movq xmm1, qword ptr [eax + edi]
99 lea eax, [eax + 2 * edi]
100 punpcklbw xmm0, xmm1
101 movq xmm2, qword ptr [eax]
102 movdqa xmm1, xmm0
103 palignr xmm1, xmm1, 8
104 movq xmm3, qword ptr [eax + edi]
105 lea eax, [eax + 2 * edi]
106 punpcklbw xmm2, xmm3
107 movdqa xmm3, xmm2
108 movq xmm4, qword ptr [eax]
109 palignr xmm3, xmm3, 8
110 movq xmm5, qword ptr [eax + edi]
111 punpcklbw xmm4, xmm5
112 lea eax, [eax + 2 * edi]
113 movdqa xmm5, xmm4
114 movq xmm6, qword ptr [eax]
115 palignr xmm5, xmm5, 8
116 movq xmm7, qword ptr [eax + edi]
117 punpcklbw xmm6, xmm7
118 mov eax, ebp
119 movdqa xmm7, xmm6
120 palignr xmm7, xmm7, 8
121 // Second round of bit swap.
122 punpcklwd xmm0, xmm2
123 punpcklwd xmm1, xmm3
124 movdqa xmm2, xmm0
125 movdqa xmm3, xmm1
126 palignr xmm2, xmm2, 8
127 palignr xmm3, xmm3, 8
128 punpcklwd xmm4, xmm6
129 punpcklwd xmm5, xmm7
130 movdqa xmm6, xmm4
131 movdqa xmm7, xmm5
132 palignr xmm6, xmm6, 8
133 palignr xmm7, xmm7, 8
134 // Third round of bit swap.
135 // Write to the destination pointer.
136 punpckldq xmm0, xmm4
137 movq qword ptr [edx], xmm0
138 movdqa xmm4, xmm0
139 palignr xmm4, xmm4, 8
140 movq qword ptr [edx + esi], xmm4
141 lea edx, [edx + 2 * esi]
142 punpckldq xmm2, xmm6
143 movdqa xmm6, xmm2
144 palignr xmm6, xmm6, 8
145 movq qword ptr [edx], xmm2
146 punpckldq xmm1, xmm5
147 movq qword ptr [edx + esi], xmm6
148 lea edx, [edx + 2 * esi]
149 movdqa xmm5, xmm1
150 movq qword ptr [edx], xmm1
151 palignr xmm5, xmm5, 8
152 punpckldq xmm3, xmm7
153 movq qword ptr [edx + esi], xmm5
154 lea edx, [edx + 2 * esi]
155 movq qword ptr [edx], xmm3
156 movdqa xmm7, xmm3
157 palignr xmm7, xmm7, 8
158 sub ecx, 8
159 movq qword ptr [edx + esi], xmm7
160 lea edx, [edx + 2 * esi]
161 jg convertloop
162
163 pop ebp
164 pop esi
165 pop edi
166 ret
167 }
168 }
169
170 #define HAS_TRANSPOSE_UVWX8_SSE2
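// Transposes 8 rows of interleaved UV pixels at a time, splitting the result
// into separate U (dst_a) and V (dst_b) planes. esp is realigned to 16 bytes
// so one xmm register can be spilled to the stack; 32-bit x86 only has
// xmm0-xmm7 available.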
171 __declspec(naked) __declspec(align(16))
172 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
173 uint8* dst_a, int dst_stride_a,
174 uint8* dst_b, int dst_stride_b,
175 int w) {
176 __asm {
177 push ebx
178 push esi
179 push edi
180 push ebp
181 mov eax, [esp + 16 + 4] // src
182 mov edi, [esp + 16 + 8] // src_stride
183 mov edx, [esp + 16 + 12] // dst_a
184 mov esi, [esp + 16 + 16] // dst_stride_a
185 mov ebx, [esp + 16 + 20] // dst_b
186 mov ebp, [esp + 16 + 24] // dst_stride_b
187 mov ecx, esp
188 sub esp, 4 + 16
189 and esp, ~15
190 mov [esp + 16], ecx
191 mov ecx, [ecx + 16 + 28] // w
192
193 align 4
194 convertloop:
195 // Read in the data from the source pointer.
196 // First round of bit swap.
197 movdqa xmm0, [eax]
198 movdqa xmm1, [eax + edi]
199 lea eax, [eax + 2 * edi]
200 movdqa xmm7, xmm0 // use xmm7 as temp register.
201 punpcklbw xmm0, xmm1
202 punpckhbw xmm7, xmm1
203 movdqa xmm1, xmm7
204 movdqa xmm2, [eax]
205 movdqa xmm3, [eax + edi]
206 lea eax, [eax + 2 * edi]
207 movdqa xmm7, xmm2
208 punpcklbw xmm2, xmm3
209 punpckhbw xmm7, xmm3
210 movdqa xmm3, xmm7
211 movdqa xmm4, [eax]
212 movdqa xmm5, [eax + edi]
213 lea eax, [eax + 2 * edi]
214 movdqa xmm7, xmm4
215 punpcklbw xmm4, xmm5
216 punpckhbw xmm7, xmm5
217 movdqa xmm5, xmm7
218 movdqa xmm6, [eax]
219 movdqa xmm7, [eax + edi]
220 lea eax, [eax + 2 * edi]
221 movdqa [esp], xmm5 // backup xmm5
222 neg edi
223 movdqa xmm5, xmm6 // use xmm5 as temp register.
224 punpcklbw xmm6, xmm7
225 punpckhbw xmm5, xmm7
226 movdqa xmm7, xmm5
227 lea eax, [eax + 8 * edi + 16]
228 neg edi
229 // Second round of bit swap.
230 movdqa xmm5, xmm0
231 punpcklwd xmm0, xmm2
232 punpckhwd xmm5, xmm2
233 movdqa xmm2, xmm5
234 movdqa xmm5, xmm1
235 punpcklwd xmm1, xmm3
236 punpckhwd xmm5, xmm3
237 movdqa xmm3, xmm5
238 movdqa xmm5, xmm4
239 punpcklwd xmm4, xmm6
240 punpckhwd xmm5, xmm6
241 movdqa xmm6, xmm5
242 movdqa xmm5, [esp] // restore xmm5
243 movdqa [esp], xmm6 // backup xmm6
244 movdqa xmm6, xmm5 // use xmm6 as temp register.
245 punpcklwd xmm5, xmm7
246 punpckhwd xmm6, xmm7
247 movdqa xmm7, xmm6
248 // Third round of bit swap.
249 // Write to the destination pointer.
250 movdqa xmm6, xmm0
251 punpckldq xmm0, xmm4
252 punpckhdq xmm6, xmm4
253 movdqa xmm4, xmm6
254 movdqa xmm6, [esp] // restore xmm6
255 movlpd qword ptr [edx], xmm0
256 movhpd qword ptr [ebx], xmm0
257 movlpd qword ptr [edx + esi], xmm4
258 lea edx, [edx + 2 * esi]
259 movhpd qword ptr [ebx + ebp], xmm4
260 lea ebx, [ebx + 2 * ebp]
261 movdqa xmm0, xmm2 // use xmm0 as the temp register.
262 punpckldq xmm2, xmm6
263 movlpd qword ptr [edx], xmm2
264 movhpd qword ptr [ebx], xmm2
265 punpckhdq xmm0, xmm6
266 movlpd qword ptr [edx + esi], xmm0
267 lea edx, [edx + 2 * esi]
268 movhpd qword ptr [ebx + ebp], xmm0
269 lea ebx, [ebx + 2 * ebp]
270 movdqa xmm0, xmm1 // use xmm0 as the temp register.
271 punpckldq xmm1, xmm5
272 movlpd qword ptr [edx], xmm1
273 movhpd qword ptr [ebx], xmm1
274 punpckhdq xmm0, xmm5
275 movlpd qword ptr [edx + esi], xmm0
276 lea edx, [edx + 2 * esi]
277 movhpd qword ptr [ebx + ebp], xmm0
278 lea ebx, [ebx + 2 * ebp]
279 movdqa xmm0, xmm3 // use xmm0 as the temp register.
280 punpckldq xmm3, xmm7
281 movlpd qword ptr [edx], xmm3
282 movhpd qword ptr [ebx], xmm3
283 punpckhdq xmm0, xmm7
284 sub ecx, 8
285 movlpd qword ptr [edx + esi], xmm0
286 lea edx, [edx + 2 * esi]
287 movhpd qword ptr [ebx + ebp], xmm0
288 lea ebx, [ebx + 2 * ebp]
289 jg convertloop
290
291 mov esp, [esp + 16]
292 pop ebp
293 pop edi
294 pop esi
295 pop ebx
296 ret
297 }
298 }
299 #elif !defined(LIBYUV_DISABLE_X86) && \
300 (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
301 #define HAS_TRANSPOSE_WX8_SSSE3
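// GCC/Clang inline-assembly version of the same 8x8 byte transpose as the
// Visual C version above: three rounds of interleaves per 8-column pass.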
302 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
303 uint8* dst, int dst_stride, int width) {
304 asm volatile (
305 // Read in the data from the source pointer.
306 // First round of bit swap.
307 ".p2align 2 \n"
308 "1: \n"
309 "movq (%0),%%xmm0 \n"
310 "movq (%0,%3),%%xmm1 \n"
311 "lea (%0,%3,2),%0 \n"
312 "punpcklbw %%xmm1,%%xmm0 \n"
313 "movq (%0),%%xmm2 \n"
314 "movdqa %%xmm0,%%xmm1 \n"
315 "palignr $0x8,%%xmm1,%%xmm1 \n"
316 "movq (%0,%3),%%xmm3 \n"
317 "lea (%0,%3,2),%0 \n"
318 "punpcklbw %%xmm3,%%xmm2 \n"
319 "movdqa %%xmm2,%%xmm3 \n"
320 "movq (%0),%%xmm4 \n"
321 "palignr $0x8,%%xmm3,%%xmm3 \n"
322 "movq (%0,%3),%%xmm5 \n"
323 "lea (%0,%3,2),%0 \n"
324 "punpcklbw %%xmm5,%%xmm4 \n"
325 "movdqa %%xmm4,%%xmm5 \n"
326 "movq (%0),%%xmm6 \n"
327 "palignr $0x8,%%xmm5,%%xmm5 \n"
328 "movq (%0,%3),%%xmm7 \n"
329 "lea (%0,%3,2),%0 \n"
330 "punpcklbw %%xmm7,%%xmm6 \n"
331 "neg %3 \n"
332 "movdqa %%xmm6,%%xmm7 \n"
333 "lea 0x8(%0,%3,8),%0 \n"
334 "palignr $0x8,%%xmm7,%%xmm7 \n"
335 "neg %3 \n"
336 // Second round of bit swap.
337 "punpcklwd %%xmm2,%%xmm0 \n"
338 "punpcklwd %%xmm3,%%xmm1 \n"
339 "movdqa %%xmm0,%%xmm2 \n"
340 "movdqa %%xmm1,%%xmm3 \n"
341 "palignr $0x8,%%xmm2,%%xmm2 \n"
342 "palignr $0x8,%%xmm3,%%xmm3 \n"
343 "punpcklwd %%xmm6,%%xmm4 \n"
344 "punpcklwd %%xmm7,%%xmm5 \n"
345 "movdqa %%xmm4,%%xmm6 \n"
346 "movdqa %%xmm5,%%xmm7 \n"
347 "palignr $0x8,%%xmm6,%%xmm6 \n"
348 "palignr $0x8,%%xmm7,%%xmm7 \n"
349 // Third round of bit swap.
350 // Write to the destination pointer.
351 "punpckldq %%xmm4,%%xmm0 \n"
352 "movq %%xmm0,(%1) \n"
353 "movdqa %%xmm0,%%xmm4 \n"
354 "palignr $0x8,%%xmm4,%%xmm4 \n"
355 "movq %%xmm4,(%1,%4) \n"
356 "lea (%1,%4,2),%1 \n"
357 "punpckldq %%xmm6,%%xmm2 \n"
358 "movdqa %%xmm2,%%xmm6 \n"
359 "movq %%xmm2,(%1) \n"
360 "palignr $0x8,%%xmm6,%%xmm6 \n"
361 "punpckldq %%xmm5,%%xmm1 \n"
362 "movq %%xmm6,(%1,%4) \n"
363 "lea (%1,%4,2),%1 \n"
364 "movdqa %%xmm1,%%xmm5 \n"
365 "movq %%xmm1,(%1) \n"
366 "palignr $0x8,%%xmm5,%%xmm5 \n"
367 "movq %%xmm5,(%1,%4) \n"
368 "lea (%1,%4,2),%1 \n"
369 "punpckldq %%xmm7,%%xmm3 \n"
370 "movq %%xmm3,(%1) \n"
371 "movdqa %%xmm3,%%xmm7 \n"
372 "palignr $0x8,%%xmm7,%%xmm7 \n"
373 "sub $0x8,%2 \n"
374 "movq %%xmm7,(%1,%4) \n"
375 "lea (%1,%4,2),%1 \n"
376 "jg 1b \n"
377 : "+r"(src), // %0
378 "+r"(dst), // %1
379 "+r"(width) // %2
380 : "r"((intptr_t)(src_stride)), // %3
381 "r"((intptr_t)(dst_stride)) // %4
382 : "memory", "cc"
383 #if defined(__SSE2__)
384 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
385 #endif
386 );
387 }
388
389 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
390 #define HAS_TRANSPOSE_UVWX8_SSE2
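// Written as a file-scope asm block (see DECLARE_FUNCTION above) rather than
// inline asm so the routine can freely use ebx/esi/edi/ebp and manage its own
// stack frame; arguments are read off the stack per the cdecl convention.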
391 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
392 uint8* dst_a, int dst_stride_a,
393 uint8* dst_b, int dst_stride_b,
394 int w);
395 asm (
396 DECLARE_FUNCTION(TransposeUVWx8_SSE2)
397 "push %ebx \n"
398 "push %esi \n"
399 "push %edi \n"
400 "push %ebp \n"
401 "mov 0x14(%esp),%eax \n"
402 "mov 0x18(%esp),%edi \n"
403 "mov 0x1c(%esp),%edx \n"
404 "mov 0x20(%esp),%esi \n"
405 "mov 0x24(%esp),%ebx \n"
406 "mov 0x28(%esp),%ebp \n"
407 "mov %esp,%ecx \n"
408 "sub $0x14,%esp \n"
409 "and $0xfffffff0,%esp \n"
410 "mov %ecx,0x10(%esp) \n"
411 "mov 0x2c(%ecx),%ecx \n"
412
413 "1: \n"
414 "movdqa (%eax),%xmm0 \n"
415 "movdqa (%eax,%edi,1),%xmm1 \n"
416 "lea (%eax,%edi,2),%eax \n"
417 "movdqa %xmm0,%xmm7 \n"
418 "punpcklbw %xmm1,%xmm0 \n"
419 "punpckhbw %xmm1,%xmm7 \n"
420 "movdqa %xmm7,%xmm1 \n"
421 "movdqa (%eax),%xmm2 \n"
422 "movdqa (%eax,%edi,1),%xmm3 \n"
423 "lea (%eax,%edi,2),%eax \n"
424 "movdqa %xmm2,%xmm7 \n"
425 "punpcklbw %xmm3,%xmm2 \n"
426 "punpckhbw %xmm3,%xmm7 \n"
427 "movdqa %xmm7,%xmm3 \n"
428 "movdqa (%eax),%xmm4 \n"
429 "movdqa (%eax,%edi,1),%xmm5 \n"
430 "lea (%eax,%edi,2),%eax \n"
431 "movdqa %xmm4,%xmm7 \n"
432 "punpcklbw %xmm5,%xmm4 \n"
433 "punpckhbw %xmm5,%xmm7 \n"
434 "movdqa %xmm7,%xmm5 \n"
435 "movdqa (%eax),%xmm6 \n"
436 "movdqa (%eax,%edi,1),%xmm7 \n"
437 "lea (%eax,%edi,2),%eax \n"
438 "movdqa %xmm5,(%esp) \n"
439 "neg %edi \n"
440 "movdqa %xmm6,%xmm5 \n"
441 "punpcklbw %xmm7,%xmm6 \n"
442 "punpckhbw %xmm7,%xmm5 \n"
443 "movdqa %xmm5,%xmm7 \n"
444 "lea 0x10(%eax,%edi,8),%eax \n"
445 "neg %edi \n"
446 "movdqa %xmm0,%xmm5 \n"
447 "punpcklwd %xmm2,%xmm0 \n"
448 "punpckhwd %xmm2,%xmm5 \n"
449 "movdqa %xmm5,%xmm2 \n"
450 "movdqa %xmm1,%xmm5 \n"
451 "punpcklwd %xmm3,%xmm1 \n"
452 "punpckhwd %xmm3,%xmm5 \n"
453 "movdqa %xmm5,%xmm3 \n"
454 "movdqa %xmm4,%xmm5 \n"
455 "punpcklwd %xmm6,%xmm4 \n"
456 "punpckhwd %xmm6,%xmm5 \n"
457 "movdqa %xmm5,%xmm6 \n"
458 "movdqa (%esp),%xmm5 \n"
459 "movdqa %xmm6,(%esp) \n"
460 "movdqa %xmm5,%xmm6 \n"
461 "punpcklwd %xmm7,%xmm5 \n"
462 "punpckhwd %xmm7,%xmm6 \n"
463 "movdqa %xmm6,%xmm7 \n"
464 "movdqa %xmm0,%xmm6 \n"
465 "punpckldq %xmm4,%xmm0 \n"
466 "punpckhdq %xmm4,%xmm6 \n"
467 "movdqa %xmm6,%xmm4 \n"
468 "movdqa (%esp),%xmm6 \n"
469 "movlpd %xmm0,(%edx) \n"
470 "movhpd %xmm0,(%ebx) \n"
471 "movlpd %xmm4,(%edx,%esi,1) \n"
472 "lea (%edx,%esi,2),%edx \n"
473 "movhpd %xmm4,(%ebx,%ebp,1) \n"
474 "lea (%ebx,%ebp,2),%ebx \n"
475 "movdqa %xmm2,%xmm0 \n"
476 "punpckldq %xmm6,%xmm2 \n"
477 "movlpd %xmm2,(%edx) \n"
478 "movhpd %xmm2,(%ebx) \n"
479 "punpckhdq %xmm6,%xmm0 \n"
480 "movlpd %xmm0,(%edx,%esi,1) \n"
481 "lea (%edx,%esi,2),%edx \n"
482 "movhpd %xmm0,(%ebx,%ebp,1) \n"
483 "lea (%ebx,%ebp,2),%ebx \n"
484 "movdqa %xmm1,%xmm0 \n"
485 "punpckldq %xmm5,%xmm1 \n"
486 "movlpd %xmm1,(%edx) \n"
487 "movhpd %xmm1,(%ebx) \n"
488 "punpckhdq %xmm5,%xmm0 \n"
489 "movlpd %xmm0,(%edx,%esi,1) \n"
490 "lea (%edx,%esi,2),%edx \n"
491 "movhpd %xmm0,(%ebx,%ebp,1) \n"
492 "lea (%ebx,%ebp,2),%ebx \n"
493 "movdqa %xmm3,%xmm0 \n"
494 "punpckldq %xmm7,%xmm3 \n"
495 "movlpd %xmm3,(%edx) \n"
496 "movhpd %xmm3,(%ebx) \n"
497 "punpckhdq %xmm7,%xmm0 \n"
498 "sub $0x8,%ecx \n"
499 "movlpd %xmm0,(%edx,%esi,1) \n"
500 "lea (%edx,%esi,2),%edx \n"
501 "movhpd %xmm0,(%ebx,%ebp,1) \n"
502 "lea (%ebx,%ebp,2),%ebx \n"
503 "jg 1b \n"
504 "mov 0x10(%esp),%esp \n"
505 "pop %ebp \n"
506 "pop %edi \n"
507 "pop %esi \n"
508 "pop %ebx \n"
509 #if defined(__native_client__)
510 "pop %ecx \n"
511 "and $0xffffffe0,%ecx \n"
512 "jmp *%ecx \n"
513 #else
514 "ret \n"
515 #endif
516 );
517 #elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
518 defined(__x86_64__)
519 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
520 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
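// Processes 16 source columns per pass using xmm8-xmm15 for the high half of
// each 16-byte load, so callers require width to be a multiple of 16.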
521 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
522 uint8* dst, int dst_stride, int width) {
523 asm volatile (
524 // Read in the data from the source pointer.
525 // First round of bit swap.
526 ".p2align 2 \n"
527 "1: \n"
528 "movdqa (%0),%%xmm0 \n"
529 "movdqa (%0,%3),%%xmm1 \n"
530 "lea (%0,%3,2),%0 \n"
531 "movdqa %%xmm0,%%xmm8 \n"
532 "punpcklbw %%xmm1,%%xmm0 \n"
533 "punpckhbw %%xmm1,%%xmm8 \n"
534 "movdqa (%0),%%xmm2 \n"
535 "movdqa %%xmm0,%%xmm1 \n"
536 "movdqa %%xmm8,%%xmm9 \n"
537 "palignr $0x8,%%xmm1,%%xmm1 \n"
538 "palignr $0x8,%%xmm9,%%xmm9 \n"
539 "movdqa (%0,%3),%%xmm3 \n"
540 "lea (%0,%3,2),%0 \n"
541 "movdqa %%xmm2,%%xmm10 \n"
542 "punpcklbw %%xmm3,%%xmm2 \n"
543 "punpckhbw %%xmm3,%%xmm10 \n"
544 "movdqa %%xmm2,%%xmm3 \n"
545 "movdqa %%xmm10,%%xmm11 \n"
546 "movdqa (%0),%%xmm4 \n"
547 "palignr $0x8,%%xmm3,%%xmm3 \n"
548 "palignr $0x8,%%xmm11,%%xmm11 \n"
549 "movdqa (%0,%3),%%xmm5 \n"
550 "lea (%0,%3,2),%0 \n"
551 "movdqa %%xmm4,%%xmm12 \n"
552 "punpcklbw %%xmm5,%%xmm4 \n"
553 "punpckhbw %%xmm5,%%xmm12 \n"
554 "movdqa %%xmm4,%%xmm5 \n"
555 "movdqa %%xmm12,%%xmm13 \n"
556 "movdqa (%0),%%xmm6 \n"
557 "palignr $0x8,%%xmm5,%%xmm5 \n"
558 "palignr $0x8,%%xmm13,%%xmm13 \n"
559 "movdqa (%0,%3),%%xmm7 \n"
560 "lea (%0,%3,2),%0 \n"
561 "movdqa %%xmm6,%%xmm14 \n"
562 "punpcklbw %%xmm7,%%xmm6 \n"
563 "punpckhbw %%xmm7,%%xmm14 \n"
564 "neg %3 \n"
565 "movdqa %%xmm6,%%xmm7 \n"
566 "movdqa %%xmm14,%%xmm15 \n"
567 "lea 0x10(%0,%3,8),%0 \n"
568 "palignr $0x8,%%xmm7,%%xmm7 \n"
569 "palignr $0x8,%%xmm15,%%xmm15 \n"
570 "neg %3 \n"
571 // Second round of bit swap.
572 "punpcklwd %%xmm2,%%xmm0 \n"
573 "punpcklwd %%xmm3,%%xmm1 \n"
574 "movdqa %%xmm0,%%xmm2 \n"
575 "movdqa %%xmm1,%%xmm3 \n"
576 "palignr $0x8,%%xmm2,%%xmm2 \n"
577 "palignr $0x8,%%xmm3,%%xmm3 \n"
578 "punpcklwd %%xmm6,%%xmm4 \n"
579 "punpcklwd %%xmm7,%%xmm5 \n"
580 "movdqa %%xmm4,%%xmm6 \n"
581 "movdqa %%xmm5,%%xmm7 \n"
582 "palignr $0x8,%%xmm6,%%xmm6 \n"
583 "palignr $0x8,%%xmm7,%%xmm7 \n"
584 "punpcklwd %%xmm10,%%xmm8 \n"
585 "punpcklwd %%xmm11,%%xmm9 \n"
586 "movdqa %%xmm8,%%xmm10 \n"
587 "movdqa %%xmm9,%%xmm11 \n"
588 "palignr $0x8,%%xmm10,%%xmm10 \n"
589 "palignr $0x8,%%xmm11,%%xmm11 \n"
590 "punpcklwd %%xmm14,%%xmm12 \n"
591 "punpcklwd %%xmm15,%%xmm13 \n"
592 "movdqa %%xmm12,%%xmm14 \n"
593 "movdqa %%xmm13,%%xmm15 \n"
594 "palignr $0x8,%%xmm14,%%xmm14 \n"
595 "palignr $0x8,%%xmm15,%%xmm15 \n"
596 // Third round of bit swap.
597 // Write to the destination pointer.
598 "punpckldq %%xmm4,%%xmm0 \n"
599 "movq %%xmm0,(%1) \n"
600 "movdqa %%xmm0,%%xmm4 \n"
601 "palignr $0x8,%%xmm4,%%xmm4 \n"
602 "movq %%xmm4,(%1,%4) \n"
603 "lea (%1,%4,2),%1 \n"
604 "punpckldq %%xmm6,%%xmm2 \n"
605 "movdqa %%xmm2,%%xmm6 \n"
606 "movq %%xmm2,(%1) \n"
607 "palignr $0x8,%%xmm6,%%xmm6 \n"
608 "punpckldq %%xmm5,%%xmm1 \n"
609 "movq %%xmm6,(%1,%4) \n"
610 "lea (%1,%4,2),%1 \n"
611 "movdqa %%xmm1,%%xmm5 \n"
612 "movq %%xmm1,(%1) \n"
613 "palignr $0x8,%%xmm5,%%xmm5 \n"
614 "movq %%xmm5,(%1,%4) \n"
615 "lea (%1,%4,2),%1 \n"
616 "punpckldq %%xmm7,%%xmm3 \n"
617 "movq %%xmm3,(%1) \n"
618 "movdqa %%xmm3,%%xmm7 \n"
619 "palignr $0x8,%%xmm7,%%xmm7 \n"
620 "movq %%xmm7,(%1,%4) \n"
621 "lea (%1,%4,2),%1 \n"
622 "punpckldq %%xmm12,%%xmm8 \n"
623 "movq %%xmm8,(%1) \n"
624 "movdqa %%xmm8,%%xmm12 \n"
625 "palignr $0x8,%%xmm12,%%xmm12 \n"
626 "movq %%xmm12,(%1,%4) \n"
627 "lea (%1,%4,2),%1 \n"
628 "punpckldq %%xmm14,%%xmm10 \n"
629 "movdqa %%xmm10,%%xmm14 \n"
630 "movq %%xmm10,(%1) \n"
631 "palignr $0x8,%%xmm14,%%xmm14 \n"
632 "punpckldq %%xmm13,%%xmm9 \n"
633 "movq %%xmm14,(%1,%4) \n"
634 "lea (%1,%4,2),%1 \n"
635 "movdqa %%xmm9,%%xmm13 \n"
636 "movq %%xmm9,(%1) \n"
637 "palignr $0x8,%%xmm13,%%xmm13 \n"
638 "movq %%xmm13,(%1,%4) \n"
639 "lea (%1,%4,2),%1 \n"
640 "punpckldq %%xmm15,%%xmm11 \n"
641 "movq %%xmm11,(%1) \n"
642 "movdqa %%xmm11,%%xmm15 \n"
643 "palignr $0x8,%%xmm15,%%xmm15 \n"
644 "sub $0x10,%2 \n"
645 "movq %%xmm15,(%1,%4) \n"
646 "lea (%1,%4,2),%1 \n"
647 "jg 1b \n"
648 : "+r"(src), // %0
649 "+r"(dst), // %1
650 "+r"(width) // %2
651 : "r"((intptr_t)(src_stride)), // %3
652 "r"((intptr_t)(dst_stride)) // %4
653 : "memory", "cc",
654 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
655 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
656 );
657 }
658
659 #define HAS_TRANSPOSE_UVWX8_SSE2
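// 64-bit UV transpose: each pass loads 8 rows of 16 interleaved UV bytes and
// writes the low (U) qwords to dst_a and the high (V) qwords to dst_b.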
660 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
661 uint8* dst_a, int dst_stride_a,
662 uint8* dst_b, int dst_stride_b,
663 int w) {
664 asm volatile (
665 // Read in the data from the source pointer.
666 // First round of bit swap.
667 ".p2align 2 \n"
668 "1: \n"
669 "movdqa (%0),%%xmm0 \n"
670 "movdqa (%0,%4),%%xmm1 \n"
671 "lea (%0,%4,2),%0 \n"
672 "movdqa %%xmm0,%%xmm8 \n"
673 "punpcklbw %%xmm1,%%xmm0 \n"
674 "punpckhbw %%xmm1,%%xmm8 \n"
675 "movdqa %%xmm8,%%xmm1 \n"
676 "movdqa (%0),%%xmm2 \n"
677 "movdqa (%0,%4),%%xmm3 \n"
678 "lea (%0,%4,2),%0 \n"
679 "movdqa %%xmm2,%%xmm8 \n"
680 "punpcklbw %%xmm3,%%xmm2 \n"
681 "punpckhbw %%xmm3,%%xmm8 \n"
682 "movdqa %%xmm8,%%xmm3 \n"
683 "movdqa (%0),%%xmm4 \n"
684 "movdqa (%0,%4),%%xmm5 \n"
685 "lea (%0,%4,2),%0 \n"
686 "movdqa %%xmm4,%%xmm8 \n"
687 "punpcklbw %%xmm5,%%xmm4 \n"
688 "punpckhbw %%xmm5,%%xmm8 \n"
689 "movdqa %%xmm8,%%xmm5 \n"
690 "movdqa (%0),%%xmm6 \n"
691 "movdqa (%0,%4),%%xmm7 \n"
692 "lea (%0,%4,2),%0 \n"
693 "movdqa %%xmm6,%%xmm8 \n"
694 "punpcklbw %%xmm7,%%xmm6 \n"
695 "neg %4 \n"
696 "lea 0x10(%0,%4,8),%0 \n"
697 "punpckhbw %%xmm7,%%xmm8 \n"
698 "movdqa %%xmm8,%%xmm7 \n"
699 "neg %4 \n"
700 // Second round of bit swap.
701 "movdqa %%xmm0,%%xmm8 \n"
702 "movdqa %%xmm1,%%xmm9 \n"
703 "punpckhwd %%xmm2,%%xmm8 \n"
704 "punpckhwd %%xmm3,%%xmm9 \n"
705 "punpcklwd %%xmm2,%%xmm0 \n"
706 "punpcklwd %%xmm3,%%xmm1 \n"
707 "movdqa %%xmm8,%%xmm2 \n"
708 "movdqa %%xmm9,%%xmm3 \n"
709 "movdqa %%xmm4,%%xmm8 \n"
710 "movdqa %%xmm5,%%xmm9 \n"
711 "punpckhwd %%xmm6,%%xmm8 \n"
712 "punpckhwd %%xmm7,%%xmm9 \n"
713 "punpcklwd %%xmm6,%%xmm4 \n"
714 "punpcklwd %%xmm7,%%xmm5 \n"
715 "movdqa %%xmm8,%%xmm6 \n"
716 "movdqa %%xmm9,%%xmm7 \n"
717 // Third round of bit swap.
718 // Write to the destination pointer.
719 "movdqa %%xmm0,%%xmm8 \n"
720 "punpckldq %%xmm4,%%xmm0 \n"
721 "movlpd %%xmm0,(%1) \n" // Write back U channel
722 "movhpd %%xmm0,(%2) \n" // Write back V channel
723 "punpckhdq %%xmm4,%%xmm8 \n"
724 "movlpd %%xmm8,(%1,%5) \n"
725 "lea (%1,%5,2),%1 \n"
726 "movhpd %%xmm8,(%2,%6) \n"
727 "lea (%2,%6,2),%2 \n"
728 "movdqa %%xmm2,%%xmm8 \n"
729 "punpckldq %%xmm6,%%xmm2 \n"
730 "movlpd %%xmm2,(%1) \n"
731 "movhpd %%xmm2,(%2) \n"
732 "punpckhdq %%xmm6,%%xmm8 \n"
733 "movlpd %%xmm8,(%1,%5) \n"
734 "lea (%1,%5,2),%1 \n"
735 "movhpd %%xmm8,(%2,%6) \n"
736 "lea (%2,%6,2),%2 \n"
737 "movdqa %%xmm1,%%xmm8 \n"
738 "punpckldq %%xmm5,%%xmm1 \n"
739 "movlpd %%xmm1,(%1) \n"
740 "movhpd %%xmm1,(%2) \n"
741 "punpckhdq %%xmm5,%%xmm8 \n"
742 "movlpd %%xmm8,(%1,%5) \n"
743 "lea (%1,%5,2),%1 \n"
744 "movhpd %%xmm8,(%2,%6) \n"
745 "lea (%2,%6,2),%2 \n"
746 "movdqa %%xmm3,%%xmm8 \n"
747 "punpckldq %%xmm7,%%xmm3 \n"
748 "movlpd %%xmm3,(%1) \n"
749 "movhpd %%xmm3,(%2) \n"
750 "punpckhdq %%xmm7,%%xmm8 \n"
751 "sub $0x8,%3 \n"
752 "movlpd %%xmm8,(%1,%5) \n"
753 "lea (%1,%5,2),%1 \n"
754 "movhpd %%xmm8,(%2,%6) \n"
755 "lea (%2,%6,2),%2 \n"
756 "jg 1b \n"
757 : "+r"(src), // %0
758 "+r"(dst_a), // %1
759 "+r"(dst_b), // %2
760 "+r"(w) // %3
761 : "r"((intptr_t)(src_stride)), // %4
762 "r"((intptr_t)(dst_stride_a)), // %5
763 "r"((intptr_t)(dst_stride_b)) // %6
764 : "memory", "cc",
765 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
766 "xmm8", "xmm9"
767 );
768 }
769 #endif
770 #endif
771
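// Portable C fallbacks. TransposeWx8_C transposes one strip of 8 rows a byte
// at a time; TransposeWxH_C handles an arbitrary remainder block.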
772 static void TransposeWx8_C(const uint8* src, int src_stride,
773 uint8* dst, int dst_stride,
774 int width) {
775 int i;
776 for (i = 0; i < width; ++i) {
777 dst[0] = src[0 * src_stride];
778 dst[1] = src[1 * src_stride];
779 dst[2] = src[2 * src_stride];
780 dst[3] = src[3 * src_stride];
781 dst[4] = src[4 * src_stride];
782 dst[5] = src[5 * src_stride];
783 dst[6] = src[6 * src_stride];
784 dst[7] = src[7 * src_stride];
785 ++src;
786 dst += dst_stride;
787 }
788 }
789
790 static void TransposeWxH_C(const uint8* src, int src_stride,
791 uint8* dst, int dst_stride,
792 int width, int height) {
793 int i;
794 for (i = 0; i < width; ++i) {
795 int j;
796 for (j = 0; j < height; ++j) {
797 dst[i * dst_stride + j] = src[j * src_stride + i];
798 }
799 }
800 }
801
802 LIBYUV_API
803 void TransposePlane(const uint8* src, int src_stride,
804 uint8* dst, int dst_stride,
805 int width, int height) {
806 int i = height;
807 void (*TransposeWx8)(const uint8* src, int src_stride,
808 uint8* dst, int dst_stride,
809 int width) = TransposeWx8_C;
810 #if defined(HAS_TRANSPOSE_WX8_NEON)
811 if (TestCpuFlag(kCpuHasNEON)) {
812 TransposeWx8 = TransposeWx8_NEON;
813 }
814 #endif
815 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
816 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
817 TransposeWx8 = TransposeWx8_SSSE3;
818 }
819 #endif
820 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
821 if (TestCpuFlag(kCpuHasSSSE3) &&
822 IS_ALIGNED(width, 16) &&
823 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
824 TransposeWx8 = TransposeWx8_FAST_SSSE3;
825 }
826 #endif
827 #if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
828 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
829 if (IS_ALIGNED(width, 4) &&
830 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
831 TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
832 } else {
833 TransposeWx8 = TransposeWx8_MIPS_DSPR2;
834 }
835 }
836 #endif
837
838 // Work across the source in 8x8 tiles
839 while (i >= 8) {
840 TransposeWx8(src, src_stride, dst, dst_stride, width);
841 src += 8 * src_stride; // Go down 8 rows.
842 dst += 8; // Move over 8 columns.
843 i -= 8;
844 }
845
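// Transpose any remaining rows (height % 8) with the generic C fallback.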
846 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
847 }
848
849 LIBYUV_API
850 void RotatePlane90(const uint8* src, int src_stride,
851 uint8* dst, int dst_stride,
852 int width, int height) {
853 // Rotate by 90 is a transpose with the source read
854 // from bottom to top. So set the source pointer to the end
855 // of the buffer and flip the sign of the source stride.
856 src += src_stride * (height - 1);
857 src_stride = -src_stride;
858 TransposePlane(src, src_stride, dst, dst_stride, width, height);
859 }
860
861 LIBYUV_API
862 void RotatePlane270(const uint8* src, int src_stride,
863 uint8* dst, int dst_stride,
864 int width, int height) {
865 // Rotate by 270 is a transpose with the destination written
866 // from bottom to top. So set the destination pointer to the end
867 // of the buffer and flip the sign of the destination stride.
868 dst += dst_stride * (width - 1);
869 dst_stride = -dst_stride;
870 TransposePlane(src, src_stride, dst, dst_stride, width, height);
871 }
872
873 LIBYUV_API
874 void RotatePlane180(const uint8* src, int src_stride,
875 uint8* dst, int dst_stride,
876 int width, int height) {
877 // Swap first and last row and mirror the content. Uses a temporary row.
878 align_buffer_64(row, width);
879 const uint8* src_bot = src + src_stride * (height - 1);
880 uint8* dst_bot = dst + dst_stride * (height - 1);
881 int half_height = (height + 1) >> 1;
882 int y;
883 void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
884 void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
885 #if defined(HAS_MIRRORROW_NEON)
886 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
887 MirrorRow = MirrorRow_NEON;
888 }
889 #endif
890 #if defined(HAS_MIRRORROW_SSE2)
891 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
892 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
893 IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
894 MirrorRow = MirrorRow_SSE2;
895 }
896 #endif
897 #if defined(HAS_MIRRORROW_SSSE3)
898 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
899 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
900 IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
901 MirrorRow = MirrorRow_SSSE3;
902 }
903 #endif
904 #if defined(HAS_MIRRORROW_AVX2)
905 if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
906 MirrorRow = MirrorRow_AVX2;
907 }
908 #endif
909 #if defined(HAS_MIRRORROW_MIPS_DSPR2)
910 if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
911 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
912 IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
913 MirrorRow = MirrorRow_MIPS_DSPR2;
914 }
915 #endif
916 #if defined(HAS_COPYROW_NEON)
917 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
918 CopyRow = CopyRow_NEON;
919 }
920 #endif
921 #if defined(HAS_COPYROW_X86)
922 if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
923 CopyRow = CopyRow_X86;
924 }
925 #endif
926 #if defined(HAS_COPYROW_SSE2)
927 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
928 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
929 IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
930 CopyRow = CopyRow_SSE2;
931 }
932 #endif
933 #if defined(HAS_COPYROW_ERMS)
934 if (TestCpuFlag(kCpuHasERMS)) {
935 CopyRow = CopyRow_ERMS;
936 }
937 #endif
938 #if defined(HAS_COPYROW_MIPS)
939 if (TestCpuFlag(kCpuHasMIPS)) {
940 CopyRow = CopyRow_MIPS;
941 }
942 #endif
943
944 // Odd height will harmlessly mirror the middle row twice.
945 for (y = 0; y < half_height; ++y) {
946 MirrorRow(src, row, width); // Mirror first row into a buffer
947 src += src_stride;
948 MirrorRow(src_bot, dst, width); // Mirror last row into first row
949 dst += dst_stride;
950 CopyRow(row, dst_bot, width); // Copy first mirrored row into last
951 src_bot -= src_stride;
952 dst_bot -= dst_stride;
953 }
954 free_aligned_buffer_64(row);
955 }
956
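// Portable fallback: de-interleaves and transposes one UV column pair per
// iteration, writing 8 U bytes to dst_a and 8 V bytes to dst_b.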
957 static void TransposeUVWx8_C(const uint8* src, int src_stride,
958 uint8* dst_a, int dst_stride_a,
959 uint8* dst_b, int dst_stride_b,
960 int width) {
961 int i;
962 for (i = 0; i < width; ++i) {
963 dst_a[0] = src[0 * src_stride + 0];
964 dst_b[0] = src[0 * src_stride + 1];
965 dst_a[1] = src[1 * src_stride + 0];
966 dst_b[1] = src[1 * src_stride + 1];
967 dst_a[2] = src[2 * src_stride + 0];
968 dst_b[2] = src[2 * src_stride + 1];
969 dst_a[3] = src[3 * src_stride + 0];
970 dst_b[3] = src[3 * src_stride + 1];
971 dst_a[4] = src[4 * src_stride + 0];
972 dst_b[4] = src[4 * src_stride + 1];
973 dst_a[5] = src[5 * src_stride + 0];
974 dst_b[5] = src[5 * src_stride + 1];
975 dst_a[6] = src[6 * src_stride + 0];
976 dst_b[6] = src[6 * src_stride + 1];
977 dst_a[7] = src[7 * src_stride + 0];
978 dst_b[7] = src[7 * src_stride + 1];
979 src += 2;
980 dst_a += dst_stride_a;
981 dst_b += dst_stride_b;
982 }
983 }
984
985 static void TransposeUVWxH_C(const uint8* src, int src_stride,
986 uint8* dst_a, int dst_stride_a,
987 uint8* dst_b, int dst_stride_b,
988 int width, int height) {
989 int i;
990 for (i = 0; i < width * 2; i += 2) {
991 int j;
992 for (j = 0; j < height; ++j) {
993 dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
994 dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
995 }
996 }
997 }
998
999 LIBYUV_API
1000 void TransposeUV(const uint8* src, int src_stride,
1001 uint8* dst_a, int dst_stride_a,
1002 uint8* dst_b, int dst_stride_b,
1003 int width, int height) {
1004 int i = height;
1005 void (*TransposeUVWx8)(const uint8* src, int src_stride,
1006 uint8* dst_a, int dst_stride_a,
1007 uint8* dst_b, int dst_stride_b,
1008 int width) = TransposeUVWx8_C;
1009 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
1010 if (TestCpuFlag(kCpuHasNEON)) {
1011 TransposeUVWx8 = TransposeUVWx8_NEON;
1012 }
1013 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
1014 if (TestCpuFlag(kCpuHasSSE2) &&
1015 IS_ALIGNED(width, 8) &&
1016 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1017 TransposeUVWx8 = TransposeUVWx8_SSE2;
1018 }
1019 #elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
1020 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
1021 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1022 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
1023 }
1024 #endif
1025
1026 // Work through the source in 8x8 tiles.
1027 while (i >= 8) {
1028 TransposeUVWx8(src, src_stride,
1029 dst_a, dst_stride_a,
1030 dst_b, dst_stride_b,
1031 width);
1032 src += 8 * src_stride; // Go down 8 rows.
1033 dst_a += 8; // Move over 8 columns.
1034 dst_b += 8; // Move over 8 columns.
1035 i -= 8;
1036 }
1037
1038 TransposeUVWxH_C(src, src_stride,
1039 dst_a, dst_stride_a,
1040 dst_b, dst_stride_b,
1041 width, i);
1042 }
1043
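// RotateUV90 and RotateUV270 follow the same pattern as the plane versions:
// a transpose with either the source (90) or the destinations (270) walked
// from the bottom up, with TransposeUV splitting interleaved UV into separate
// U and V planes.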
1044 LIBYUV_API
1045 void RotateUV90(const uint8* src, int src_stride,
1046 uint8* dst_a, int dst_stride_a,
1047 uint8* dst_b, int dst_stride_b,
1048 int width, int height) {
1049 src += src_stride * (height - 1);
1050 src_stride = -src_stride;
1051
1052 TransposeUV(src, src_stride,
1053 dst_a, dst_stride_a,
1054 dst_b, dst_stride_b,
1055 width, height);
1056 }
1057
1058 LIBYUV_API
1059 void RotateUV270(const uint8* src, int src_stride,
1060 uint8* dst_a, int dst_stride_a,
1061 uint8* dst_b, int dst_stride_b,
1062 int width, int height) {
1063 dst_a += dst_stride_a * (width - 1);
1064 dst_b += dst_stride_b * (width - 1);
1065 dst_stride_a = -dst_stride_a;
1066 dst_stride_b = -dst_stride_b;
1067
1068 TransposeUV(src, src_stride,
1069 dst_a, dst_stride_a,
1070 dst_b, dst_stride_b,
1071 width, height);
1072 }
1073
1074 // Rotate 180 is a horizontal and vertical flip.
1075 LIBYUV_API
1076 void RotateUV180(const uint8* src, int src_stride,
1077 uint8* dst_a, int dst_stride_a,
1078 uint8* dst_b, int dst_stride_b,
1079 int width, int height) {
1080 int i;
1081 void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
1082 MirrorUVRow_C;
1083 #if defined(HAS_MIRRORUVROW_NEON)
1084 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
1085 MirrorRowUV = MirrorUVRow_NEON;
1086 }
1087 #elif defined(HAS_MIRRORROW_UV_SSSE3)
1088 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
1089 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
1090 MirrorRowUV = MirrorUVRow_SSSE3;
1091 }
1092 #elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
1093 if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
1094 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
1095 MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
1096 }
1097 #endif
1098
1099 dst_a += dst_stride_a * (height - 1);
1100 dst_b += dst_stride_b * (height - 1);
1101
1102 for (i = 0; i < height; ++i) {
1103 MirrorRowUV(src, dst_a, dst_b, width);
1104 src += src_stride;
1105 dst_a -= dst_stride_a;
1106 dst_b -= dst_stride_b;
1107 }
1108 }
1109
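// Rotating by 90 or 270 swaps the image dimensions, so the destination must
// be sized height x width and dst_stride must cover 'height' pixels.
// Illustrative call (buffer names are hypothetical):
//   RotatePlane(src, width, dst, height, width, height, kRotate90);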
1110 LIBYUV_API
1111 int RotatePlane(const uint8* src, int src_stride,
1112 uint8* dst, int dst_stride,
1113 int width, int height,
1114 enum RotationMode mode) {
1115 if (!src || width <= 0 || height == 0 || !dst) {
1116 return -1;
1117 }
1118
1119 // Negative height means invert the image.
1120 if (height < 0) {
1121 height = -height;
1122 src = src + (height - 1) * src_stride;
1123 src_stride = -src_stride;
1124 }
1125
1126 switch (mode) {
1127 case kRotate0:
1128 // copy frame
1129 CopyPlane(src, src_stride,
1130 dst, dst_stride,
1131 width, height);
1132 return 0;
1133 case kRotate90:
1134 RotatePlane90(src, src_stride,
1135 dst, dst_stride,
1136 width, height);
1137 return 0;
1138 case kRotate270:
1139 RotatePlane270(src, src_stride,
1140 dst, dst_stride,
1141 width, height);
1142 return 0;
1143 case kRotate180:
1144 RotatePlane180(src, src_stride,
1145 dst, dst_stride,
1146 width, height);
1147 return 0;
1148 default:
1149 break;
1150 }
1151 return -1;
1152 }
1153
1154 LIBYUV_API
1155 int I420Rotate(const uint8* src_y, int src_stride_y,
1156 const uint8* src_u, int src_stride_u,
1157 const uint8* src_v, int src_stride_v,
1158 uint8* dst_y, int dst_stride_y,
1159 uint8* dst_u, int dst_stride_u,
1160 uint8* dst_v, int dst_stride_v,
1161 int width, int height,
1162 enum RotationMode mode) {
1163 int halfwidth = (width + 1) >> 1;
1164 int halfheight = (height + 1) >> 1;
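// I420 chroma is subsampled 2x2, so the U and V planes are rotated at
// halfwidth x halfheight.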
1165 if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
1166 !dst_y || !dst_u || !dst_v) {
1167 return -1;
1168 }
1169
1170 // Negative height means invert the image.
1171 if (height < 0) {
1172 height = -height;
1173 halfheight = (height + 1) >> 1;
1174 src_y = src_y + (height - 1) * src_stride_y;
1175 src_u = src_u + (halfheight - 1) * src_stride_u;
1176 src_v = src_v + (halfheight - 1) * src_stride_v;
1177 src_stride_y = -src_stride_y;
1178 src_stride_u = -src_stride_u;
1179 src_stride_v = -src_stride_v;
1180 }
1181
1182 switch (mode) {
1183 case kRotate0:
1184 // copy frame
1185 return I420Copy(src_y, src_stride_y,
1186 src_u, src_stride_u,
1187 src_v, src_stride_v,
1188 dst_y, dst_stride_y,
1189 dst_u, dst_stride_u,
1190 dst_v, dst_stride_v,
1191 width, height);
1192 case kRotate90:
1193 RotatePlane90(src_y, src_stride_y,
1194 dst_y, dst_stride_y,
1195 width, height);
1196 RotatePlane90(src_u, src_stride_u,
1197 dst_u, dst_stride_u,
1198 halfwidth, halfheight);
1199 RotatePlane90(src_v, src_stride_v,
1200 dst_v, dst_stride_v,
1201 halfwidth, halfheight);
1202 return 0;
1203 case kRotate270:
1204 RotatePlane270(src_y, src_stride_y,
1205 dst_y, dst_stride_y,
1206 width, height);
1207 RotatePlane270(src_u, src_stride_u,
1208 dst_u, dst_stride_u,
1209 halfwidth, halfheight);
1210 RotatePlane270(src_v, src_stride_v,
1211 dst_v, dst_stride_v,
1212 halfwidth, halfheight);
1213 return 0;
1214 case kRotate180:
1215 RotatePlane180(src_y, src_stride_y,
1216 dst_y, dst_stride_y,
1217 width, height);
1218 RotatePlane180(src_u, src_stride_u,
1219 dst_u, dst_stride_u,
1220 halfwidth, halfheight);
1221 RotatePlane180(src_v, src_stride_v,
1222 dst_v, dst_stride_v,
1223 halfwidth, halfheight);
1224 return 0;
1225 default:
1226 break;
1227 }
1228 return -1;
1229 }
1230
1231 LIBYUV_API
1232 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
1233 const uint8* src_uv, int src_stride_uv,
1234 uint8* dst_y, int dst_stride_y,
1235 uint8* dst_u, int dst_stride_u,
1236 uint8* dst_v, int dst_stride_v,
1237 int width, int height,
1238 enum RotationMode mode) {
1239 int halfwidth = (width + 1) >> 1;
1240 int halfheight = (height + 1) >> 1;
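// NV12 carries interleaved UV at half resolution; the RotateUV* helpers used
// below both rotate and de-interleave it into the I420 U and V planes.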
1241 if (!src_y || !src_uv || width <= 0 || height == 0 ||
1242 !dst_y || !dst_u || !dst_v) {
1243 return -1;
1244 }
1245
1246 // Negative height means invert the image.
1247 if (height < 0) {
1248 height = -height;
1249 halfheight = (height + 1) >> 1;
1250 src_y = src_y + (height - 1) * src_stride_y;
1251 src_uv = src_uv + (halfheight - 1) * src_stride_uv;
1252 src_stride_y = -src_stride_y;
1253 src_stride_uv = -src_stride_uv;
1254 }
1255
1256 switch (mode) {
1257 case kRotate0:
1258 // copy frame
1259 return NV12ToI420(src_y, src_stride_y,
1260 src_uv, src_stride_uv,
1261 dst_y, dst_stride_y,
1262 dst_u, dst_stride_u,
1263 dst_v, dst_stride_v,
1264 width, height);
1265 case kRotate90:
1266 RotatePlane90(src_y, src_stride_y,
1267 dst_y, dst_stride_y,
1268 width, height);
1269 RotateUV90(src_uv, src_stride_uv,
1270 dst_u, dst_stride_u,
1271 dst_v, dst_stride_v,
1272 halfwidth, halfheight);
1273 return 0;
1274 case kRotate270:
1275 RotatePlane270(src_y, src_stride_y,
1276 dst_y, dst_stride_y,
1277 width, height);
1278 RotateUV270(src_uv, src_stride_uv,
1279 dst_u, dst_stride_u,
1280 dst_v, dst_stride_v,
1281 halfwidth, halfheight);
1282 return 0;
1283 case kRotate180:
1284 RotatePlane180(src_y, src_stride_y,
1285 dst_y, dst_stride_y,
1286 width, height);
1287 RotateUV180(src_uv, src_stride_uv,
1288 dst_u, dst_stride_u,
1289 dst_v, dst_stride_v,
1290 halfwidth, halfheight);
1291 return 0;
1292 default:
1293 break;
1294 }
1295 return -1;
1296 }
1297
1298 #ifdef __cplusplus
1299 } // extern "C"
1300 } // namespace libyuv
1301 #endif
1302