/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#if defined(_MSC_VER)
#include <intrin.h>  // For __popcnt
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  // XOR 4 bytes at a time; each set bit in x marks a differing bit, and the
  // POPCNT instruction counts them.
  int i;
  for (i = 0; i < count - 3; i += 4) {
    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
    src_a += 4;
    src_b += 4;
    diff += __popcnt(x);
  }
  return diff;
}
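
// A scalar sketch equivalent to the loop above (illustrative only; the guard
// LIBYUV_HAMMING_REFERENCE is hypothetical and not used by the build):
#ifdef LIBYUV_HAMMING_REFERENCE
static uint32_t HammingDistance_Ref(const uint8_t* a, const uint8_t* b,
                                    int n) {
  uint32_t diff = 0;
  int i;
  for (i = 0; i < n; ++i) {
    uint32_t x = (uint32_t)(a[i] ^ b[i]);
    while (x) {  // Kernighan's trick: clear the lowest set bit per step.
      x &= x - 1;
      ++diff;
    }
  }
  return diff;
}
#endif  // LIBYUV_HAMMING_REFERENCE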

__declspec(naked) uint32_t
    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0  // sum of squared differences
    pxor       xmm5, xmm5  // constant 0 for unpck

  wloop:
    movdqu     xmm1, [eax]
    lea        eax,  [eax + 16]
    movdqu     xmm2, [edx]
    lea        edx,  [edx + 16]
    movdqa     xmm3, xmm1  // abs trick
    psubusb    xmm1, xmm2  // saturating subtract zeroes the negative side,
    psubusb    xmm2, xmm3  // so the OR of both directions is |a - b|.
    por        xmm1, xmm2
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5  // widen u8 to u16
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1  // square and pairwise-add to u32
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee  // add high 2 dwords to low 2
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01  // add dword 1 to dword 0
    paddd      xmm0, xmm1
    movd       eax, xmm0  // return the 32-bit sum
    ret
  }
}
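
// Scalar reference for the kernel above (an illustrative sketch; the guard
// LIBYUV_SSE_REFERENCE is hypothetical and not used by the build):
#ifdef LIBYUV_SSE_REFERENCE
static uint32_t SumSquareError_Ref(const uint8_t* a, const uint8_t* b, int n) {
  uint32_t sse = 0;
  int i;
  for (i = 0; i < n; ++i) {
    int d = (int)a[i] - (int)b[i];
    sse += (uint32_t)(d * d);  // matches the pmaddwd square-and-accumulate
  }
  return sse;
}
#endif  // LIBYUV_SSE_REFERENCE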

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax  // offset of src_b from src_a; one index walks both

  wloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + edx]
    lea        eax,  [eax + 32]
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0
    vzeroupper  // avoid AVX-SSE transition penalties in the caller
    ret
  }
}
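
// Note on the addressing idiom above: after "sub edx, eax", the loop reads
// src_b through [eax + edx], since src_a + i + (src_b - src_a) == src_b + i.
// A single lea then advances both streams, saving one increment per pass.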
#endif  // _MSC_VER >= 1700

uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};
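
// How the constants fall out of the djb2 recurrence (a sketch of the math):
// the scalar hash is h = h * 33 + c per byte, so unrolling over a 16-byte
// block b[0..15] gives
//   h' = h * 33^16 + b[0] * 33^15 + b[1] * 33^14 + ... + b[15] * 33^0,
// all mod 2^32. kHash16x33 holds 33^16, and kHashMul0..3 hold 33^15 down to
// 33^0, one multiplier per byte lane; e.g. 33^2 = 1089 = 0x441 matches the
// second entry of kHashMul3.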

__declspec(naked) uint32_t
    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7  // constant 0 for unpck
    movdqa     xmm6, xmmword ptr kHash16x33

  wloop:
    movdqu     xmm1, [eax]  // src[0-15]
    lea        eax, [eax + 16]
    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7  // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7  // src[0-3]
    pmulld     xmm3, xmm5
    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7  // src[4-7]
    pmulld     xmm4, xmm5
    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7  // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7  // src[8-11]
    pmulld     xmm2, xmm5
    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7  // src[12-15]
    pmulld     xmm1, xmm5
    paddd      xmm3, xmm4  // add 16 results
    paddd      xmm1, xmm2
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0  // return hash
    ret
  }
}
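
// Scalar form of the hash computed above (illustrative sketch of the djb2
// recurrence; the guard LIBYUV_HASH_REFERENCE is hypothetical and not used
// by the build):
#ifdef LIBYUV_HASH_REFERENCE
static uint32_t HashDjb2_Ref(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];  // one multiply-accumulate per byte
  }
  return hash;
}
#endif  // LIBYUV_HASH_REFERENCE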

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// Same 16 bytes per iteration as the SSE4.1 version, but vpmovzxbd widens
// bytes straight to u32, removing the punpck ladder.
__declspec(naked) uint32_t
    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
    vmovd      xmm0, [esp + 12]  // seed

  wloop:
    vpmovzxbd  xmm3, [eax]  // src[0-3]
    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    lea        eax, [eax + 16]
    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
    vpaddd     xmm3, xmm3, xmm4  // add 16 results
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm1, xmm1, xmm3
    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
    vpaddd     xmm1, xmm1, xmm2
    vpshufd    xmm2, xmm1, 0x01
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm0, xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    vmovd      eax, xmm0  // return hash
    vzeroupper
    ret
  }
}
#endif  // _MSC_VER >= 1700

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif