/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

22 __declspec(naked)
TransposeWx8_SSSE3(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)23 void TransposeWx8_SSSE3(const uint8* src, int src_stride,
24                         uint8* dst, int dst_stride, int width) {
25   __asm {
26     push      edi
27     push      esi
28     push      ebp
29     mov       eax, [esp + 12 + 4]   // src
30     mov       edi, [esp + 12 + 8]   // src_stride
31     mov       edx, [esp + 12 + 12]  // dst
32     mov       esi, [esp + 12 + 16]  // dst_stride
33     mov       ecx, [esp + 12 + 20]  // width
34 
35     // Read in the data from the source pointer.
36     // First round of bit swap.
37     align      4
38  convertloop:
39     movq      xmm0, qword ptr [eax]
40     lea       ebp, [eax + 8]
41     movq      xmm1, qword ptr [eax + edi]
42     lea       eax, [eax + 2 * edi]
43     punpcklbw xmm0, xmm1
44     movq      xmm2, qword ptr [eax]
45     movdqa    xmm1, xmm0
46     palignr   xmm1, xmm1, 8
47     movq      xmm3, qword ptr [eax + edi]
48     lea       eax, [eax + 2 * edi]
49     punpcklbw xmm2, xmm3
50     movdqa    xmm3, xmm2
51     movq      xmm4, qword ptr [eax]
52     palignr   xmm3, xmm3, 8
53     movq      xmm5, qword ptr [eax + edi]
54     punpcklbw xmm4, xmm5
55     lea       eax, [eax + 2 * edi]
56     movdqa    xmm5, xmm4
57     movq      xmm6, qword ptr [eax]
58     palignr   xmm5, xmm5, 8
59     movq      xmm7, qword ptr [eax + edi]
60     punpcklbw xmm6, xmm7
61     mov       eax, ebp
62     movdqa    xmm7, xmm6
63     palignr   xmm7, xmm7, 8
64     // Second round of bit swap.
65     punpcklwd xmm0, xmm2
66     punpcklwd xmm1, xmm3
67     movdqa    xmm2, xmm0
68     movdqa    xmm3, xmm1
69     palignr   xmm2, xmm2, 8
70     palignr   xmm3, xmm3, 8
71     punpcklwd xmm4, xmm6
72     punpcklwd xmm5, xmm7
73     movdqa    xmm6, xmm4
74     movdqa    xmm7, xmm5
75     palignr   xmm6, xmm6, 8
76     palignr   xmm7, xmm7, 8
77     // Third round of bit swap.
78     // Write to the destination pointer.
79     punpckldq xmm0, xmm4
80     movq      qword ptr [edx], xmm0
81     movdqa    xmm4, xmm0
82     palignr   xmm4, xmm4, 8
83     movq      qword ptr [edx + esi], xmm4
84     lea       edx, [edx + 2 * esi]
85     punpckldq xmm2, xmm6
86     movdqa    xmm6, xmm2
87     palignr   xmm6, xmm6, 8
88     movq      qword ptr [edx], xmm2
89     punpckldq xmm1, xmm5
90     movq      qword ptr [edx + esi], xmm6
91     lea       edx, [edx + 2 * esi]
92     movdqa    xmm5, xmm1
93     movq      qword ptr [edx], xmm1
94     palignr   xmm5, xmm5, 8
95     punpckldq xmm3, xmm7
96     movq      qword ptr [edx + esi], xmm5
97     lea       edx, [edx + 2 * esi]
98     movq      qword ptr [edx], xmm3
99     movdqa    xmm7, xmm3
100     palignr   xmm7, xmm7, 8
101     sub       ecx, 8
102     movq      qword ptr [edx + esi], xmm7
103     lea       edx, [edx + 2 * esi]
104     jg        convertloop
105 
106     pop       ebp
107     pop       esi
108     pop       edi
109     ret
110   }
111 }
113 __declspec(naked)
TransposeUVWx8_SSE2(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int w)114 void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
115                          uint8* dst_a, int dst_stride_a,
116                          uint8* dst_b, int dst_stride_b,
117                          int w) {
118   __asm {
119     push      ebx
120     push      esi
121     push      edi
122     push      ebp
123     mov       eax, [esp + 16 + 4]   // src
124     mov       edi, [esp + 16 + 8]   // src_stride
125     mov       edx, [esp + 16 + 12]  // dst_a
126     mov       esi, [esp + 16 + 16]  // dst_stride_a
127     mov       ebx, [esp + 16 + 20]  // dst_b
128     mov       ebp, [esp + 16 + 24]  // dst_stride_b
129     mov       ecx, esp
130     sub       esp, 4 + 16
131     and       esp, ~15
132     mov       [esp + 16], ecx
133     mov       ecx, [ecx + 16 + 28]  // w
134 
135     align      4
136  convertloop:
137     // Read in the data from the source pointer.
138     // First round of bit swap.
139     movdqu    xmm0, [eax]
140     movdqu    xmm1, [eax + edi]
141     lea       eax, [eax + 2 * edi]
142     movdqa    xmm7, xmm0  // use xmm7 as temp register.
143     punpcklbw xmm0, xmm1
144     punpckhbw xmm7, xmm1
145     movdqa    xmm1, xmm7
146     movdqu    xmm2, [eax]
147     movdqu    xmm3, [eax + edi]
148     lea       eax, [eax + 2 * edi]
149     movdqa    xmm7, xmm2
150     punpcklbw xmm2, xmm3
151     punpckhbw xmm7, xmm3
152     movdqa    xmm3, xmm7
153     movdqu    xmm4, [eax]
154     movdqu    xmm5, [eax + edi]
155     lea       eax, [eax + 2 * edi]
156     movdqa    xmm7, xmm4
157     punpcklbw xmm4, xmm5
158     punpckhbw xmm7, xmm5
159     movdqa    xmm5, xmm7
160     movdqu    xmm6, [eax]
161     movdqu    xmm7, [eax + edi]
162     lea       eax, [eax + 2 * edi]
163     movdqu    [esp], xmm5  // backup xmm5
164     neg       edi
165     movdqa    xmm5, xmm6   // use xmm5 as temp register.
166     punpcklbw xmm6, xmm7
167     punpckhbw xmm5, xmm7
168     movdqa    xmm7, xmm5
169     lea       eax, [eax + 8 * edi + 16]
170     neg       edi
171     // Second round of bit swap.
172     movdqa    xmm5, xmm0
173     punpcklwd xmm0, xmm2
174     punpckhwd xmm5, xmm2
175     movdqa    xmm2, xmm5
176     movdqa    xmm5, xmm1
177     punpcklwd xmm1, xmm3
178     punpckhwd xmm5, xmm3
179     movdqa    xmm3, xmm5
180     movdqa    xmm5, xmm4
181     punpcklwd xmm4, xmm6
182     punpckhwd xmm5, xmm6
183     movdqa    xmm6, xmm5
184     movdqu    xmm5, [esp]  // restore xmm5
185     movdqu    [esp], xmm6  // backup xmm6
186     movdqa    xmm6, xmm5    // use xmm6 as temp register.
187     punpcklwd xmm5, xmm7
188     punpckhwd xmm6, xmm7
189     movdqa    xmm7, xmm6
190     // Third round of bit swap.
191     // Write to the destination pointer.
192     movdqa    xmm6, xmm0
193     punpckldq xmm0, xmm4
194     punpckhdq xmm6, xmm4
195     movdqa    xmm4, xmm6
196     movdqu    xmm6, [esp]  // restore xmm6
197     movlpd    qword ptr [edx], xmm0
198     movhpd    qword ptr [ebx], xmm0
199     movlpd    qword ptr [edx + esi], xmm4
200     lea       edx, [edx + 2 * esi]
201     movhpd    qword ptr [ebx + ebp], xmm4
202     lea       ebx, [ebx + 2 * ebp]
203     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
204     punpckldq xmm2, xmm6
205     movlpd    qword ptr [edx], xmm2
206     movhpd    qword ptr [ebx], xmm2
207     punpckhdq xmm0, xmm6
208     movlpd    qword ptr [edx + esi], xmm0
209     lea       edx, [edx + 2 * esi]
210     movhpd    qword ptr [ebx + ebp], xmm0
211     lea       ebx, [ebx + 2 * ebp]
212     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
213     punpckldq xmm1, xmm5
214     movlpd    qword ptr [edx], xmm1
215     movhpd    qword ptr [ebx], xmm1
216     punpckhdq xmm0, xmm5
217     movlpd    qword ptr [edx + esi], xmm0
218     lea       edx, [edx + 2 * esi]
219     movhpd    qword ptr [ebx + ebp], xmm0
220     lea       ebx, [ebx + 2 * ebp]
221     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
222     punpckldq xmm3, xmm7
223     movlpd    qword ptr [edx], xmm3
224     movhpd    qword ptr [ebx], xmm3
225     punpckhdq xmm0, xmm7
226     sub       ecx, 8
227     movlpd    qword ptr [edx + esi], xmm0
228     lea       edx, [edx + 2 * esi]
229     movhpd    qword ptr [ebx + ebp], xmm0
230     lea       ebx, [ebx + 2 * ebp]
231     jg        convertloop
232 
233     mov       esp, [esp + 16]
234     pop       ebp
235     pop       edi
236     pop       esi
237     pop       ebx
238     ret
239   }
240 }

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif