/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate.h"

#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
    ".text \n" \
    ".private_extern _" #name " \n" \
    ".align 4,0x90 \n" \
    "_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
    ".text \n" \
    ".align 4,0x90 \n" \
    "_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
    ".text \n" \
    ".align 4,0x90 \n" \
    #name ": \n"
#endif
#endif
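
// Note: DECLARE_FUNCTION emits the assembler directives that open a function
// label in a module-level asm() block. As a rough illustration (assuming the
// non-Apple, non-MinGW branch above), DECLARE_FUNCTION(TransposeUVWx8_SSE2)
// would expand to something like:
//   ".text \n"
//   ".align 4,0x90 \n"
//   "TransposeUVWx8_SSE2: \n"
// The Apple and MinGW/Cygwin variants differ only in the leading underscore
// on the symbol name and, on Apple, the .private_extern visibility directive.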

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorRowUV_NEON(const uint8* src,
                      uint8* dst_a, uint8* dst_b,
                      int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
#endif  // defined(__ARM_NEON__)

#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push edi
    push esi
    push ebp
    mov eax, [esp + 12 + 4]   // src
    mov edi, [esp + 12 + 8]   // src_stride
    mov edx, [esp + 12 + 12]  // dst
    mov esi, [esp + 12 + 16]  // dst_stride
    mov ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align 16
  convertloop:
    movq xmm0, qword ptr [eax]
    lea ebp, [eax + 8]
    movq xmm1, qword ptr [eax + edi]
    lea eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq xmm2, qword ptr [eax]
    movdqa xmm1, xmm0
    palignr xmm1, xmm1, 8
    movq xmm3, qword ptr [eax + edi]
    lea eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa xmm3, xmm2
    movq xmm4, qword ptr [eax]
    palignr xmm3, xmm3, 8
    movq xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea eax, [eax + 2 * edi]
    movdqa xmm5, xmm4
    movq xmm6, qword ptr [eax]
    palignr xmm5, xmm5, 8
    movq xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov eax, ebp
    movdqa xmm7, xmm6
    palignr xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    palignr xmm2, xmm2, 8
    palignr xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa xmm6, xmm4
    movdqa xmm7, xmm5
    palignr xmm6, xmm6, 8
    palignr xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq qword ptr [edx], xmm0
    movdqa xmm4, xmm0
    palignr xmm4, xmm4, 8
    movq qword ptr [edx + esi], xmm4
    lea edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa xmm6, xmm2
    palignr xmm6, xmm6, 8
    movq qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq qword ptr [edx + esi], xmm6
    lea edx, [edx + 2 * esi]
    movdqa xmm5, xmm1
    movq qword ptr [edx], xmm1
    palignr xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq qword ptr [edx + esi], xmm5
    lea edx, [edx + 2 * esi]
    movq qword ptr [edx], xmm3
    movdqa xmm7, xmm3
    palignr xmm7, xmm7, 8
    sub ecx, 8
    movq qword ptr [edx + esi], xmm7
    lea edx, [edx + 2 * esi]
    jg convertloop

    pop ebp
    pop esi
    pop edi
    ret
  }
}
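
// The three "rounds of bit swap" above implement an 8x8 byte transpose purely
// with interleaves: punpcklbw pairs bytes from adjacent rows, punpcklwd pairs
// the resulting 16-bit groups, and punpckldq pairs 32-bit groups, after which
// each qword written out is one column of the original 8x8 tile. A rough 4x4
// sketch of the same idea (rows r0..r3, columns a..d):
//   r0: a0 b0 c0 d0                       a0 a1 a2 a3   <- column a is now a row
//   r1: a1 b1 c1 d1   -- interleaves ->   b0 b1 b2 b3
//   r2: a2 b2 c2 d2                       c0 c1 c2 c3
//   r3: a3 b3 c3 d3                       d0 d1 d2 d3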

#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push ebx
    push esi
    push edi
    push ebp
    mov eax, [esp + 16 + 4]   // src
    mov edi, [esp + 16 + 8]   // src_stride
    mov edx, [esp + 16 + 12]  // dst_a
    mov esi, [esp + 16 + 16]  // dst_stride_a
    mov ebx, [esp + 16 + 20]  // dst_b
    mov ebp, [esp + 16 + 24]  // dst_stride_b
    mov ecx, esp
    sub esp, 4 + 16
    and esp, ~15
    mov [esp + 16], ecx
    mov ecx, [ecx + 16 + 28]  // w

    align 16
  convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa xmm1, xmm7
    movdqa xmm2, [eax]
    movdqa xmm3, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa xmm3, xmm7
    movdqa xmm4, [eax]
    movdqa xmm5, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa xmm5, xmm7
    movdqa xmm6, [eax]
    movdqa xmm7, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa [esp], xmm5  // backup xmm5
    neg edi
    movdqa xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa xmm7, xmm5
    lea eax, [eax + 8 * edi + 16]
    neg edi
    // Second round of bit swap.
    movdqa xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa xmm2, xmm5
    movdqa xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa xmm3, xmm5
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa xmm6, xmm5
    movdqa xmm5, [esp]  // restore xmm5
    movdqa [esp], xmm6  // backup xmm6
    movdqa xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa xmm4, xmm6
    movdqa xmm6, [esp]  // restore xmm6
    movlpd qword ptr [edx], xmm0
    movhpd qword ptr [ebx], xmm0
    movlpd qword ptr [edx + esi], xmm4
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm4
    lea ebx, [ebx + 2 * ebp]
    movdqa xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd qword ptr [edx], xmm2
    movhpd qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd qword ptr [edx + esi], xmm0
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm0
    lea ebx, [ebx + 2 * ebp]
    movdqa xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd qword ptr [edx], xmm1
    movhpd qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd qword ptr [edx + esi], xmm0
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm0
    lea ebx, [ebx + 2 * ebp]
    movdqa xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd qword ptr [edx], xmm3
    movhpd qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub ecx, 8
    movlpd qword ptr [edx + esi], xmm0
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm0
    lea ebx, [ebx + 2 * ebp]
    jg convertloop

    mov esp, [esp + 16]
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 4 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "movq (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movq (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "movq (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movq (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "movq (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movq (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "lea 0x8(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "sub $0x8,%2 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"(static_cast<intptr_t>(src_stride)),  // %3
      "r"(static_cast<intptr_t>(dst_stride))   // %4
    : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

#if !defined(YUV_DISABLE_ASM) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int w);
asm (
  DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    "push %ebx \n"
    "push %esi \n"
    "push %edi \n"
    "push %ebp \n"
    "mov 0x14(%esp),%eax \n"
    "mov 0x18(%esp),%edi \n"
    "mov 0x1c(%esp),%edx \n"
    "mov 0x20(%esp),%esi \n"
    "mov 0x24(%esp),%ebx \n"
    "mov 0x28(%esp),%ebp \n"
    "mov %esp,%ecx \n"
    "sub $0x14,%esp \n"
    "and $0xfffffff0,%esp \n"
    "mov %ecx,0x10(%esp) \n"
    "mov 0x2c(%ecx),%ecx \n"

    "1: \n"
    "movdqa (%eax),%xmm0 \n"
    "movdqa (%eax,%edi,1),%xmm1 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm0,%xmm7 \n"
    "punpcklbw %xmm1,%xmm0 \n"
    "punpckhbw %xmm1,%xmm7 \n"
    "movdqa %xmm7,%xmm1 \n"
    "movdqa (%eax),%xmm2 \n"
    "movdqa (%eax,%edi,1),%xmm3 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm2,%xmm7 \n"
    "punpcklbw %xmm3,%xmm2 \n"
    "punpckhbw %xmm3,%xmm7 \n"
    "movdqa %xmm7,%xmm3 \n"
    "movdqa (%eax),%xmm4 \n"
    "movdqa (%eax,%edi,1),%xmm5 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm4,%xmm7 \n"
    "punpcklbw %xmm5,%xmm4 \n"
    "punpckhbw %xmm5,%xmm7 \n"
    "movdqa %xmm7,%xmm5 \n"
    "movdqa (%eax),%xmm6 \n"
    "movdqa (%eax,%edi,1),%xmm7 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm5,(%esp) \n"
    "neg %edi \n"
    "movdqa %xmm6,%xmm5 \n"
    "punpcklbw %xmm7,%xmm6 \n"
    "punpckhbw %xmm7,%xmm5 \n"
    "movdqa %xmm5,%xmm7 \n"
    "lea 0x10(%eax,%edi,8),%eax \n"
    "neg %edi \n"
    "movdqa %xmm0,%xmm5 \n"
    "punpcklwd %xmm2,%xmm0 \n"
    "punpckhwd %xmm2,%xmm5 \n"
    "movdqa %xmm5,%xmm2 \n"
    "movdqa %xmm1,%xmm5 \n"
    "punpcklwd %xmm3,%xmm1 \n"
    "punpckhwd %xmm3,%xmm5 \n"
    "movdqa %xmm5,%xmm3 \n"
    "movdqa %xmm4,%xmm5 \n"
    "punpcklwd %xmm6,%xmm4 \n"
    "punpckhwd %xmm6,%xmm5 \n"
    "movdqa %xmm5,%xmm6 \n"
    "movdqa (%esp),%xmm5 \n"
    "movdqa %xmm6,(%esp) \n"
    "movdqa %xmm5,%xmm6 \n"
    "punpcklwd %xmm7,%xmm5 \n"
    "punpckhwd %xmm7,%xmm6 \n"
    "movdqa %xmm6,%xmm7 \n"
    "movdqa %xmm0,%xmm6 \n"
    "punpckldq %xmm4,%xmm0 \n"
    "punpckhdq %xmm4,%xmm6 \n"
    "movdqa %xmm6,%xmm4 \n"
    "movdqa (%esp),%xmm6 \n"
    "movlpd %xmm0,(%edx) \n"
    "movhpd %xmm0,(%ebx) \n"
    "movlpd %xmm4,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm4,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm2,%xmm0 \n"
    "punpckldq %xmm6,%xmm2 \n"
    "movlpd %xmm2,(%edx) \n"
    "movhpd %xmm2,(%ebx) \n"
    "punpckhdq %xmm6,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm1,%xmm0 \n"
    "punpckldq %xmm5,%xmm1 \n"
    "movlpd %xmm1,(%edx) \n"
    "movhpd %xmm1,(%ebx) \n"
    "punpckhdq %xmm5,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm3,%xmm0 \n"
    "punpckldq %xmm7,%xmm3 \n"
    "movlpd %xmm3,(%edx) \n"
    "movhpd %xmm3,(%ebx) \n"
    "punpckhdq %xmm7,%xmm0 \n"
    "sub $0x8,%ecx \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "jg 1b \n"
    "mov 0x10(%esp),%esp \n"
    "pop %ebp \n"
    "pop %edi \n"
    "pop %esi \n"
    "pop %ebx \n"
    "ret \n"
);
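
// The stack offsets in the prologue above follow the standard cdecl layout:
// after the four pushes (ebx, esi, edi, ebp) and the return address, the first
// argument sits at 0x14(%esp), i.e. 4 saved registers * 4 bytes + 4 bytes of
// return address, with each subsequent argument 4 bytes higher. The
// sub/and sequence on %esp then carves out a 16-byte-aligned scratch slot so
// xmm registers can be spilled and reloaded with movdqa inside the loop.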
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm9 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "palignr $0x8,%%xmm9,%%xmm9 \n"
    "movdqa (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm2,%%xmm10 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm10 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movdqa %%xmm10,%%xmm11 \n"
    "movdqa (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "movdqa (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm4,%%xmm12 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm12 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movdqa %%xmm12,%%xmm13 \n"
    "movdqa (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movdqa (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm6,%%xmm14 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "punpckhbw %%xmm7,%%xmm14 \n"
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "movdqa %%xmm14,%%xmm15 \n"
    "lea 0x10(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "punpcklwd %%xmm10,%%xmm8 \n"
    "punpcklwd %%xmm11,%%xmm9 \n"
    "movdqa %%xmm8,%%xmm10 \n"
    "movdqa %%xmm9,%%xmm11 \n"
    "palignr $0x8,%%xmm10,%%xmm10 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "punpcklwd %%xmm14,%%xmm12 \n"
    "punpcklwd %%xmm15,%%xmm13 \n"
    "movdqa %%xmm12,%%xmm14 \n"
    "movdqa %%xmm13,%%xmm15 \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm12,%%xmm8 \n"
    "movq %%xmm8,(%1) \n"
    "movdqa %%xmm8,%%xmm12 \n"
    "palignr $0x8,%%xmm12,%%xmm12 \n"
    "movq %%xmm12,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm14,%%xmm10 \n"
    "movdqa %%xmm10,%%xmm14 \n"
    "movq %%xmm10,(%1) \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "punpckldq %%xmm13,%%xmm9 \n"
    "movq %%xmm14,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm9,%%xmm13 \n"
    "movq %%xmm9,(%1) \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movq %%xmm13,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm15,%%xmm11 \n"
    "movq %%xmm11,(%1) \n"
    "movdqa %%xmm11,%%xmm15 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "sub $0x10,%2 \n"
    "movq %%xmm15,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"(static_cast<intptr_t>(src_stride)),  // %3
      "r"(static_cast<intptr_t>(dst_stride))   // %4
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
  );
}

#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%4),%%xmm1 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm1 \n"
    "movdqa (%0),%%xmm2 \n"
    "movdqa (%0,%4),%%xmm3 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm3 \n"
    "movdqa (%0),%%xmm4 \n"
    "movdqa (%0,%4),%%xmm5 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm5 \n"
    "movdqa (%0),%%xmm6 \n"
    "movdqa (%0,%4),%%xmm7 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm6,%%xmm8 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "neg %4 \n"
    "lea 0x10(%0,%4,8),%0 \n"
    "punpckhbw %%xmm7,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm7 \n"
    "neg %4 \n"
    // Second round of bit swap.
    "movdqa %%xmm0,%%xmm8 \n"
    "movdqa %%xmm1,%%xmm9 \n"
    "punpckhwd %%xmm2,%%xmm8 \n"
    "punpckhwd %%xmm3,%%xmm9 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm2 \n"
    "movdqa %%xmm9,%%xmm3 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "movdqa %%xmm5,%%xmm9 \n"
    "punpckhwd %%xmm6,%%xmm8 \n"
    "punpckhwd %%xmm7,%%xmm9 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm8,%%xmm6 \n"
    "movdqa %%xmm9,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "movdqa %%xmm0,%%xmm8 \n"
    "punpckldq %%xmm4,%%xmm0 \n"
    "movlpd %%xmm0,(%1) \n"  // Write back U channel
    "movhpd %%xmm0,(%2) \n"  // Write back V channel
    "punpckhdq %%xmm4,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movlpd %%xmm2,(%1) \n"
    "movhpd %%xmm2,(%2) \n"
    "punpckhdq %%xmm6,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm1,%%xmm8 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movlpd %%xmm1,(%1) \n"
    "movhpd %%xmm1,(%2) \n"
    "punpckhdq %%xmm5,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm3,%%xmm8 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movlpd %%xmm3,(%1) \n"
    "movhpd %%xmm3,(%2) \n"
    "punpckhdq %%xmm7,%%xmm8 \n"
    "sub $0x8,%3 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst_a),  // %1
      "+r"(dst_b),  // %2
      "+r"(w)       // %3
    : "r"(static_cast<intptr_t>(src_stride)),    // %4
      "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
      "r"(static_cast<intptr_t>(dst_stride_b))   // %6
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9"
  );
}
#endif
#endif

static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width) {
  for (int i = 0; i < width; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < height; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
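
// For reference, the C transpose simply swaps row and column indices:
// dst[i * dst_stride + j] = src[j * src_stride + i] for every (i, j). A tiny
// worked example with a 2x3 source (width = 3, height = 2) and tightly packed
// strides, for illustration only:
//
//   src (2 rows x 3 cols):  A B C      dst (3 rows x 2 cols):  A D
//                           D E F                              B E
//                                                              C F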

LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  void (*TransposeWx8)(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeWx8 = TransposeWx8_NEON;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    TransposeWx8 = TransposeWx8_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
  }
#endif

  // Work across the source in 8x8 tiles.
  int i = height;
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;  // Go down 8 rows.
    dst += 8;               // Move over 8 columns.
    i -= 8;
  }

  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
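
// A minimal usage sketch for TransposePlane (illustration only; the buffers,
// sizes, and strides below are hypothetical). Transposing a width x height
// plane produces a height x width plane, so the destination stride must be at
// least the source height:
//
//   uint8 src[640 * 480];   // 640 wide, 480 tall, stride 640
//   uint8 dst[480 * 640];   // 480 wide, 640 tall, stride 480
//   TransposePlane(src, 640, dst, 480, 640, 480);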

LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
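
// Worked example of the pointer math above: for a plane with height = 480 and
// src_stride = 640, src is advanced by 640 * 479 bytes so it points at the
// first byte of the bottom row, and the negated stride then walks the rows
// bottom-to-top while TransposePlane reads them.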

LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
#if defined(HAS_MIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRow = MirrorRow_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSE2;
  }
#endif
#if defined(HAS_MIRRORROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
#endif
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
    CopyRow = CopyRow_NEON;
  }
#endif
#if defined(HAS_COPYROW_X86)
  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
  }
#endif
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
  if (width > kMaxStride) {
    return;
  }
  // Swap first and last row and mirror the content. Uses a temporary row.
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  const uint8* src_bot = src + src_stride * (height - 1);
  uint8* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;
  // Odd height will harmlessly mirror the middle row twice.
  for (int y = 0; y < half_height; ++y) {
    MirrorRow(src, row, width);  // Mirror first row into a buffer.
    src += src_stride;
    MirrorRow(src_bot, dst, width);  // Mirror last row into first row.
    dst += dst_stride;
    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last.
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
}
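
// Because the top row is mirrored into the temporary buffer before the bottom
// row is mirrored over it, this loop appears to tolerate rotating in place
// (src == dst with matching strides): the temporary row preserves the top row
// until it is copied into the bottom position.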

static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  for (int i = 0; i < width * 2; i += 2) {
    for (int j = 0; j < height; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
  }
}
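
// In the UV variants the source rows hold interleaved two-byte pixel pairs
// (U then V for NV12-style data), so the loop steps i by 2 bytes per output
// column: src[i + j * src_stride] is the first byte of the pair and
// src[i + j * src_stride + 1] is the second, and both land in row (i >> 1) of
// dst_a and dst_b respectively.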

LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*TransposeUVWx8)(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeUVWx8 = TransposeUVWx8_NEON;
  }
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 8) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeUVWx8 = TransposeUVWx8_SSE2;
  }
#endif

  // Work through the source in 8x8 tiles.
  int i = height;
  while (i >= 8) {
    TransposeUVWx8(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width);
    src += 8 * src_stride;  // Go down 8 rows.
    dst_a += 8;             // Move over 8 columns.
    dst_b += 8;             // Move over 8 columns.
    i -= 8;
  }

  TransposeUVWxH_C(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width, i);
}

LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
      MirrorRowUV_C;
#if defined(HAS_MIRRORROW_UV_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRowUV = MirrorRowUV_NEON;
  }
#elif defined(HAS_MIRRORROW_UV_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    MirrorRowUV = MirrorRowUV_SSSE3;
  }
#endif

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (int i = 0; i < height; ++i) {
    MirrorRowUV(src, dst_a, dst_b, width);
    src += src_stride;
    dst_a -= dst_stride_a;
    dst_b -= dst_stride_b;
  }
}

LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               RotationMode mode) {
  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}
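
// A minimal I420Rotate usage sketch (illustration only; the plane pointers and
// dimensions are hypothetical). For kRotate90 the destination planes are
// height x width, so the Y stride becomes the source height and the chroma
// strides become half of it, rounded up:
//
//   // 640x480 -> 480x640, tightly packed planes.
//   I420Rotate(src_y, 640, src_u, 320, src_v, 320,
//              dst_y, 480, dst_u, 240, dst_v, 240,
//              640, 480, kRotate90);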

LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     RotationMode mode) {
  if (!src_y || !src_uv || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_stride_y,
                        src_uv, src_stride_uv,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}
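
// Note that for the 90, 180, and 270 cases the interleaved NV12 UV plane is
// deinterleaved and rotated in a single pass by the RotateUV* helpers above,
// so no intermediate I420 frame is needed; only kRotate0 falls back to a
// plain NV12ToI420 copy.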

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif