/*
 * Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
void TransposeWx8_SSSE3(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movq (%0),%%xmm0 \n"
      "movq (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "movq (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "movq (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movq (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "movq (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movq (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "lea 0x8(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "neg %3 \n"
      // Second round of bit swap.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "sub $0x8,%2 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
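
// A minimal scalar sketch of the contract the SSSE3 kernel above fulfills:
// each loop iteration consumes a `width` x 8 block (8 source rows) and writes
// it transposed as `width` rows of 8 bytes. It mirrors the plain C fallback
// the library dispatches to when SSSE3 is unavailable; the function name
// TransposeWx8_Sketch and the #if 0 guard are illustrative additions and are
// not part of this file's build.
#if 0
static void TransposeWx8_Sketch(const uint8_t* src,
                                int src_stride,
                                uint8_t* dst,
                                int dst_stride,
                                int width) {
  // Source column x (one byte from each of the 8 rows) becomes destination
  // row x (8 contiguous bytes).
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
#endif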

// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
void TransposeWx8_Fast_SSSE3(const uint8_t* src,
                             int src_stride,
                             uint8_t* dst,
                             int dst_stride,
                             int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%3),%%xmm1 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm9 \n"
      "palignr $0x8,%%xmm1,%%xmm1 \n"
      "palignr $0x8,%%xmm9,%%xmm9 \n"
      "movdqu (%0,%3),%%xmm3 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm2,%%xmm10 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm10 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "movdqa %%xmm10,%%xmm11 \n"
      "movdqu (%0),%%xmm4 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "movdqu (%0,%3),%%xmm5 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm4,%%xmm12 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm12 \n"
      "movdqa %%xmm4,%%xmm5 \n"
      "movdqa %%xmm12,%%xmm13 \n"
      "movdqu (%0),%%xmm6 \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movdqu (%0,%3),%%xmm7 \n"
      "lea (%0,%3,2),%0 \n"
      "movdqa %%xmm6,%%xmm14 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "punpckhbw %%xmm7,%%xmm14 \n"
      "neg %3 \n"
      "movdqa %%xmm6,%%xmm7 \n"
      "movdqa %%xmm14,%%xmm15 \n"
      "lea 0x10(%0,%3,8),%0 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      "neg %3 \n"
      // Second round of bit swap.
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm0,%%xmm2 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "palignr $0x8,%%xmm2,%%xmm2 \n"
      "palignr $0x8,%%xmm3,%%xmm3 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "movdqa %%xmm5,%%xmm7 \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "punpcklwd %%xmm10,%%xmm8 \n"
      "punpcklwd %%xmm11,%%xmm9 \n"
      "movdqa %%xmm8,%%xmm10 \n"
      "movdqa %%xmm9,%%xmm11 \n"
      "palignr $0x8,%%xmm10,%%xmm10 \n"
      "palignr $0x8,%%xmm11,%%xmm11 \n"
      "punpcklwd %%xmm14,%%xmm12 \n"
      "punpcklwd %%xmm15,%%xmm13 \n"
      "movdqa %%xmm12,%%xmm14 \n"
      "movdqa %%xmm13,%%xmm15 \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq %%xmm4,%%xmm0 \n"
      "movq %%xmm0,(%1) \n"
      "movdqa %%xmm0,%%xmm4 \n"
      "palignr $0x8,%%xmm4,%%xmm4 \n"
      "movq %%xmm4,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm6 \n"
      "movq %%xmm2,(%1) \n"
      "palignr $0x8,%%xmm6,%%xmm6 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movq %%xmm6,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm1,%%xmm5 \n"
      "movq %%xmm1,(%1) \n"
      "palignr $0x8,%%xmm5,%%xmm5 \n"
      "movq %%xmm5,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movq %%xmm3,(%1) \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "palignr $0x8,%%xmm7,%%xmm7 \n"
      "movq %%xmm7,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm12,%%xmm8 \n"
      "movq %%xmm8,(%1) \n"
      "movdqa %%xmm8,%%xmm12 \n"
      "palignr $0x8,%%xmm12,%%xmm12 \n"
      "movq %%xmm12,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm14,%%xmm10 \n"
      "movdqa %%xmm10,%%xmm14 \n"
      "movq %%xmm10,(%1) \n"
      "palignr $0x8,%%xmm14,%%xmm14 \n"
      "punpckldq %%xmm13,%%xmm9 \n"
      "movq %%xmm14,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "movdqa %%xmm9,%%xmm13 \n"
      "movq %%xmm9,(%1) \n"
      "palignr $0x8,%%xmm13,%%xmm13 \n"
      "movq %%xmm13,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "punpckldq %%xmm15,%%xmm11 \n"
      "movq %%xmm11,(%1) \n"
      "movdqa %%xmm11,%%xmm15 \n"
      "palignr $0x8,%%xmm15,%%xmm15 \n"
      "sub $0x10,%2 \n"
      "movq %%xmm15,(%1,%4) \n"
      "lea (%1,%4,2),%1 \n"
      "jg 1b \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
        "xmm15");
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8_t* src,
                         int src_stride,
                         uint8_t* dst_a,
                         int dst_stride_a,
                         uint8_t* dst_b,
                         int dst_stride_b,
                         int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"
      "movdqu (%0,%4),%%xmm1 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm0,%%xmm8 \n"
      "punpcklbw %%xmm1,%%xmm0 \n"
      "punpckhbw %%xmm1,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm1 \n"
      "movdqu (%0),%%xmm2 \n"
      "movdqu (%0,%4),%%xmm3 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpcklbw %%xmm3,%%xmm2 \n"
      "punpckhbw %%xmm3,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm3 \n"
      "movdqu (%0),%%xmm4 \n"
      "movdqu (%0,%4),%%xmm5 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "punpcklbw %%xmm5,%%xmm4 \n"
      "punpckhbw %%xmm5,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm5 \n"
      "movdqu (%0),%%xmm6 \n"
      "movdqu (%0,%4),%%xmm7 \n"
      "lea (%0,%4,2),%0 \n"
      "movdqa %%xmm6,%%xmm8 \n"
      "punpcklbw %%xmm7,%%xmm6 \n"
      "neg %4 \n"
      "lea 0x10(%0,%4,8),%0 \n"
      "punpckhbw %%xmm7,%%xmm8 \n"
      "movdqa %%xmm8,%%xmm7 \n"
      "neg %4 \n"
      // Second round of bit swap.
      "movdqa %%xmm0,%%xmm8 \n"
      "movdqa %%xmm1,%%xmm9 \n"
      "punpckhwd %%xmm2,%%xmm8 \n"
      "punpckhwd %%xmm3,%%xmm9 \n"
      "punpcklwd %%xmm2,%%xmm0 \n"
      "punpcklwd %%xmm3,%%xmm1 \n"
      "movdqa %%xmm8,%%xmm2 \n"
      "movdqa %%xmm9,%%xmm3 \n"
      "movdqa %%xmm4,%%xmm8 \n"
      "movdqa %%xmm5,%%xmm9 \n"
      "punpckhwd %%xmm6,%%xmm8 \n"
      "punpckhwd %%xmm7,%%xmm9 \n"
      "punpcklwd %%xmm6,%%xmm4 \n"
      "punpcklwd %%xmm7,%%xmm5 \n"
      "movdqa %%xmm8,%%xmm6 \n"
      "movdqa %%xmm9,%%xmm7 \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "movdqa %%xmm0,%%xmm8 \n"
      "punpckldq %%xmm4,%%xmm0 \n"
      "movlpd %%xmm0,(%1) \n"  // Write back U channel
      "movhpd %%xmm0,(%2) \n"  // Write back V channel
      "punpckhdq %%xmm4,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm2,%%xmm8 \n"
      "punpckldq %%xmm6,%%xmm2 \n"
      "movlpd %%xmm2,(%1) \n"
      "movhpd %%xmm2,(%2) \n"
      "punpckhdq %%xmm6,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm1,%%xmm8 \n"
      "punpckldq %%xmm5,%%xmm1 \n"
      "movlpd %%xmm1,(%1) \n"
      "movhpd %%xmm1,(%2) \n"
      "punpckhdq %%xmm5,%%xmm8 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "movdqa %%xmm3,%%xmm8 \n"
      "punpckldq %%xmm7,%%xmm3 \n"
      "movlpd %%xmm3,(%1) \n"
      "movhpd %%xmm3,(%2) \n"
      "punpckhdq %%xmm7,%%xmm8 \n"
      "sub $0x8,%3 \n"
      "movlpd %%xmm8,(%1,%5) \n"
      "lea (%1,%5,2),%1 \n"
      "movhpd %%xmm8,(%2,%6) \n"
      "lea (%2,%6,2),%2 \n"
      "jg 1b \n"
      : "+r"(src),                      // %0
        "+r"(dst_a),                    // %1
        "+r"(dst_b),                    // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride)),    // %4
        "r"((intptr_t)(dst_stride_a)),  // %5
        "r"((intptr_t)(dst_stride_b))   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9");
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
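
// A scalar sketch of what the SSE2 kernel above computes, under the layout it
// assumes: each source row holds `width` interleaved U/V byte pairs, and after
// the transpose the U bytes of column x form row x of dst_a while the V bytes
// form row x of dst_b (the movlpd/movhpd pair above splits those two halves).
// TransposeUVWx8_Sketch and the #if 0 guard are illustrative additions, not
// part of this file's build.
#if 0
static void TransposeUVWx8_Sketch(const uint8_t* src,
                                  int src_stride,
                                  uint8_t* dst_a,
                                  int dst_stride_a,
                                  uint8_t* dst_b,
                                  int dst_stride_b,
                                  int width) {
  // Pair x of a source row sits at byte offsets 2*x (U) and 2*x+1 (V).
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x];
      dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];
    }
  }
}
#endif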

#if defined(HAS_TRANSPOSE4X4_32_SSE2)
// 4 values, little endian view
// a b c d
// e f g h
// i j k l
// m n o p

// transpose 2x2
// a e b f from row 0, 1
// i m j n from row 2, 3
// c g d h from row 0, 1
// k o l p from row 2, 3

// transpose 4x4
// a e i m from row 0, 1
// b f j n from row 0, 1
// c g k o from row 2, 3
// d h l p from row 2, 3

// Transpose 32 bit values (ARGB)
void Transpose4x4_32_SSE2(const uint8_t* src,
                          int src_stride,
                          uint8_t* dst,
                          int dst_stride,
                          int width) {
  asm volatile(
      // Main loop transpose 4x4. Read a column, write a row.
      "1: \n"
      "movdqu (%0),%%xmm0 \n"     // a b c d
      "movdqu (%0,%3),%%xmm1 \n"  // e f g h
      "lea (%0,%3,2),%0 \n"       // src += stride * 2
      "movdqu (%0),%%xmm2 \n"     // i j k l
      "movdqu (%0,%3),%%xmm3 \n"  // m n o p
      "lea (%0,%3,2),%0 \n"       // src += stride * 2

      // Transpose 2x2
      "movdqa %%xmm0,%%xmm4 \n"
      "movdqa %%xmm2,%%xmm5 \n"
      "movdqa %%xmm0,%%xmm6 \n"
      "movdqa %%xmm2,%%xmm7 \n"
      "punpckldq %%xmm1,%%xmm4 \n"  // a e b f from row 0, 1
      "punpckldq %%xmm3,%%xmm5 \n"  // i m j n from row 2, 3
      "punpckhdq %%xmm1,%%xmm6 \n"  // c g d h from row 0, 1
      "punpckhdq %%xmm3,%%xmm7 \n"  // k o l p from row 2, 3

      // Transpose 4x4
      "movdqa %%xmm4,%%xmm0 \n"
      "movdqa %%xmm4,%%xmm1 \n"
      "movdqa %%xmm6,%%xmm2 \n"
      "movdqa %%xmm6,%%xmm3 \n"
      "punpcklqdq %%xmm5,%%xmm0 \n"  // a e i m from row 0, 1
      "punpckhqdq %%xmm5,%%xmm1 \n"  // b f j n from row 0, 1
      "punpcklqdq %%xmm7,%%xmm2 \n"  // c g k o from row 2, 3
      "punpckhqdq %%xmm7,%%xmm3 \n"  // d h l p from row 2, 3

      "movdqu %%xmm0,(%1) \n"
      "lea 16(%1,%4),%1 \n"  // dst += stride + 16
      "movdqu %%xmm1,-16(%1) \n"
      "movdqu %%xmm2,-16(%1,%4) \n"
      "movdqu %%xmm3,-16(%1,%4,2) \n"
      "sub %4,%1 \n"
      "sub $0x4,%2 \n"
      "jg 1b \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+rm"(width)                   // %2
      : "r"((ptrdiff_t)(src_stride)),  // %3
        "r"((ptrdiff_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSE4X4_32_SSE2)
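
// The SSE2 kernel above realizes the two-stage shuffle described in the
// comment block before it: punpckldq/punpckhdq form 2x2 blocks of 32-bit
// values, then punpcklqdq/punpckhqdq merge them into fully transposed rows.
// The scalar sketch below shows the net effect on a single 4x4 tile of 32-bit
// (e.g. ARGB) values; Transpose4x4_32_Sketch and the #if 0 guard are
// illustrative additions, not part of this file's build, and strides are
// taken in bytes as in the kernel above.
#if 0
static void Transpose4x4_32_Sketch(const uint8_t* src,
                                   int src_stride,
                                   uint8_t* dst,
                                   int dst_stride) {
  // 32-bit element (y, x) of the source tile becomes element (x, y) of the
  // destination tile; the inner loop copies the 4 bytes of one element.
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) {
      for (int b = 0; b < 4; ++b) {
        dst[x * dst_stride + y * 4 + b] = src[y * src_stride + x * 4 + b];
      }
    }
  }
}
#endif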

#if defined(HAS_TRANSPOSE4X4_32_AVX2)

// Transpose 32 bit values (ARGB)
void Transpose4x4_32_AVX2(const uint8_t* src,
                          int src_stride,
                          uint8_t* dst,
                          int dst_stride,
                          int width) {
  asm volatile(
      // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
      "1: \n"
      "vmovdqu (%0),%%xmm0 \n"     // a b c d
      "vmovdqu (%0,%3),%%xmm1 \n"  // e f g h
      "lea (%0,%3,2),%0 \n"        // src += stride * 2
      "vmovdqu (%0),%%xmm2 \n"     // i j k l
      "vmovdqu (%0,%3),%%xmm3 \n"  // m n o p
      "lea (%0,%3,2),%0 \n"        // src += stride * 2

      "vinserti128 $1,(%0),%%ymm0,%%ymm0 \n"     // a b c d
      "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1 \n"  // e f g h
      "lea (%0,%3,2),%0 \n"                      // src += stride * 2
      "vinserti128 $1,(%0),%%ymm2,%%ymm2 \n"     // i j k l
      "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3 \n"  // m n o p
      "lea (%0,%3,2),%0 \n"                      // src += stride * 2

      // Transpose 2x2
      "vpunpckldq %%ymm1,%%ymm0,%%ymm4 \n"  // a e b f from row 0, 1
      "vpunpckldq %%ymm3,%%ymm2,%%ymm5 \n"  // i m j n from row 2, 3
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm6 \n"  // c g d h from row 0, 1
      "vpunpckhdq %%ymm3,%%ymm2,%%ymm7 \n"  // k o l p from row 2, 3

      // Transpose 4x4
      "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0 \n"  // a e i m from row 0, 1
      "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1 \n"  // b f j n from row 0, 1
      "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2 \n"  // c g k o from row 2, 3
      "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3 \n"  // d h l p from row 2, 3

      "vmovdqu %%ymm0,(%1) \n"
      "lea 32(%1,%4),%1 \n"  // dst += stride + 32
      "vmovdqu %%ymm1,-32(%1) \n"
      "vmovdqu %%ymm2,-32(%1,%4) \n"
      "vmovdqu %%ymm3,-32(%1,%4,2) \n"
      "sub %4,%1 \n"
      "sub $0x8,%2 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src),                     // %0
        "+r"(dst),                     // %1
        "+rm"(width)                   // %2
      : "r"((ptrdiff_t)(src_stride)),  // %3
        "r"((ptrdiff_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSE4X4_32_AVX2)

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif