1 /*
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for 32 bit Visual C x86 and clangcl
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
21
// Transposes a block of 8 source rows x 'width' columns of 8-bit pixels:
// each iteration converts an 8x8 tile of src into an 8x8 tile of dst with
// rows and columns swapped. SSSE3 (palignr) version for 32-bit Visual C
// and clang-cl.
//
// __declspec(naked): the compiler emits no prologue/epilogue, so arguments
// are loaded manually relative to esp. The "+ 12" in the operand offsets
// accounts for the three 4-byte pushes preceding the loads.
//
// Register roles inside the loop:
//   eax = current source row pointer   edi = src_stride
//   edx = current dest row pointer     esi = dst_stride
//   ecx = columns remaining            ebp = src + 8 (next 8-column tile)
// All eight xmm registers are clobbered (caller-saved in this ABI).
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap: byte-interleave adjacent row pairs, so
    // each xmm holds alternating bytes from two rows.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]        // remember start of next 8-column tile
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1            // rows 0,1 interleaved
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8         // xmm1 = high half of xmm0
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3            // rows 2,3 interleaved
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5            // rows 4,5 interleaved
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7            // rows 6,7 interleaved
    mov       eax, ebp              // advance src to the next 8 columns
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap: interleave 16-bit pairs, building 2x2
    // transposed sub-blocks.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap: interleave 32-bit quads, completing the
    // transpose. Each finished 8-byte row is written to the destination
    // as soon as it is produced (interleaved with the shuffles to hide
    // store latency).
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8                // 8 columns consumed this iteration
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop           // branches on flags from sub ecx, 8
                                    // (movq/lea do not modify flags)

    pop       ebp
    pop       esi
    pop       edi
    ret                             // naked function: explicit ret required
  }
}
112
// Transposes a block of 8 source rows of interleaved U/V byte pairs,
// de-interleaving as it goes: the U bytes are transposed into dst_a and
// the V bytes into dst_b. SSE2 version for 32-bit Visual C and clang-cl.
//
// __declspec(naked): arguments are loaded manually relative to esp; the
// "+ 16" in the offsets accounts for the four 4-byte pushes above.
//
// All eight xmm registers are live simultaneously, so one register is
// spilled to a 16-byte stack scratch slot. esp is realigned to 16 bytes
// for that slot; the pre-alignment esp is saved at [esp + 16] and is the
// only way back to the original stack (and to the 'w' argument).
//
// Register roles:
//   eax = src pointer    edi = src_stride
//   edx = dst_a          esi = dst_stride_a
//   ebx = dst_b          ebp = dst_stride_b
//   ecx = saved esp during setup, then columns remaining
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp              // keep pre-alignment esp to reach args
    sub       esp, 4 + 16           // 16-byte spill slot + slot for old esp
    and       esp, ~15              // align spill slot to 16 bytes
    mov       [esp + 16], ecx       // save original esp for the epilogue
    mov       ecx, [ecx + 16 + 28]  // w (read via saved esp; esp has moved)

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap: byte-interleave adjacent row pairs,
    // keeping both low (punpcklbw) and high (punpckhbw) halves since
    // each source row is 16 bytes (8 UV pairs).
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0            // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5           // backup xmm5 to the stack scratch slot
    neg       edi                   // negate stride so lea below can rewind
    movdqa    xmm5, xmm6            // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]  // src -= 8*stride; src += 16
                                         // (back to row 0, next 16 bytes)
    neg       edi                   // restore positive stride
    // Second round of bit swap: interleave 16-bit pairs.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]           // restore xmm5
    movdqu    [esp], xmm6           // backup xmm6 (slot reused for xmm6)
    movdqa    xmm6, xmm5            // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap: interleave 32-bit quads. Each result now
    // holds a transposed U row in its low qword and the matching V row in
    // its high qword, so movlpd stores to dst_a and movhpd to dst_b.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]           // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2            // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1            // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3            // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8                // 8 UV columns consumed this iteration
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop           // branches on flags from sub ecx, 8

    mov       esp, [esp + 16]       // restore the pre-alignment esp
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret                             // naked function: explicit ret required
  }
}
241
242 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
243
244 #ifdef __cplusplus
245 } // extern "C"
246 } // namespace libyuv
247 #endif
248