/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
                                          int src_stride,
                                          uint8* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]  // src
    mov       edi, [esp + 12 + 8]  // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]  // remember the next 8 source columns
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp  // advance src to the next 8 columns of the top row
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8  // 8 source columns transposed per iteration
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
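
// Illustrative scalar sketch (not part of the original file): this is roughly
// what TransposeWx8_SSSE3 above computes, assuming width is a multiple of 8 as
// the SIMD loop requires. Each 8-byte column of src becomes an 8-byte row of
// dst. The name TransposeWx8_Sketch is hypothetical, for illustration only.
static void TransposeWx8_Sketch(const uint8* src,
                                int src_stride,
                                uint8* dst,
                                int dst_stride,
                                int width) {
  for (int x = 0; x < width; ++x) {  // each source column becomes a dst row.
    for (int y = 0; y < 8; ++y) {    // 8 source rows become 8 dst bytes.
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
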
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
                                           int src_stride,
                                           uint8* dst_a,
                                           int dst_stride_a,
                                           uint8* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]  // src
    mov       edi, [esp + 16 + 8]  // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Align esp to 16 bytes and reserve 16 bytes of xmm spill space;
    // the original esp is saved at [esp + 16] and restored before ret.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx  // save original esp
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]  // back up 8 rows; advance 16 bytes (8 UV pairs).
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5  // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8  // 8 UV pairs transposed per iteration
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
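
// Illustrative scalar sketch (not part of the original file): roughly what
// TransposeUVWx8_SSE2 above computes, assuming w (the number of UV pairs per
// row) is a multiple of 8. It de-interleaves 8 rows of UV pairs and writes the
// transposed U bytes to dst_a and V bytes to dst_b. The name
// TransposeUVWx8_Sketch is hypothetical, for illustration only.
static void TransposeUVWx8_Sketch(const uint8* src,
                                  int src_stride,
                                  uint8* dst_a,
                                  int dst_stride_a,
                                  uint8* dst_b,
                                  int dst_stride_b,
                                  int w) {
  for (int x = 0; x < w; ++x) {    // x indexes UV pairs (2 bytes each) in a row.
    for (int y = 0; y < 8; ++y) {  // 8 source rows.
      dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];  // U plane
      dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];  // V plane
    }
  }
}
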
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif