/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#if defined(_MSC_VER)
#include <intrin.h>  // For __popcnt
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

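// Hamming distance: XOR the two inputs and count the set bits with the
// POPCNT instruction (introduced alongside SSE4.2).  Processes 4 bytes per
// iteration; any trailing count % 4 bytes are left to the caller.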
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  int i;
  for (i = 0; i < count - 3; i += 4) {
    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
    src_a += 4;
    src_b += 4;
    diff += __popcnt(x);
  }
  return diff;
}

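// Sum of squared differences, 16 bytes per iteration.  psubusb in both
// directions OR'd together yields |a - b| (the "abs trick" below), then
// pmaddwd squares the widened u16 values and pairwise-adds them into u32
// accumulators, which are reduced to a single sum after the loop.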
__declspec(naked) uint32_t
SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    pxor       xmm0, xmm0
    pxor       xmm5, xmm5

  wloop:
    movdqu     xmm1, [eax]
    lea        eax, [eax + 16]
    movdqu     xmm2, [edx]
    lea        edx, [edx + 16]
    movdqa     xmm3, xmm1  // abs trick
    psubusb    xmm1, xmm2
    psubusb    xmm2, xmm3
    por        xmm1, xmm2
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm5
    punpckhbw  xmm2, xmm5
    pmaddwd    xmm1, xmm1
    pmaddwd    xmm2, xmm2
    paddd      xmm0, xmm1
    paddd      xmm0, xmm2
    sub        ecx, 16
    jg         wloop

    pshufd     xmm1, xmm0, 0xee
    paddd      xmm0, xmm1
    pshufd     xmm1, xmm0, 0x01
    paddd      xmm0, xmm1
    movd       eax, xmm0
    ret
  }
}

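// AVX2 version, 32 bytes per iteration.  src_b is kept as an offset from
// src_a (sub edx, eax) so a single lea advances both input streams.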
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable : 4752)
__declspec(naked) uint32_t
SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
  __asm {
    mov        eax, [esp + 4]  // src_a
    mov        edx, [esp + 8]  // src_b
    mov        ecx, [esp + 12]  // count
    vpxor      ymm0, ymm0, ymm0  // sum
    vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    sub        edx, eax

  wloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + edx]
    lea        eax, [eax + 32]
    vpsubusb   ymm3, ymm1, ymm2  // abs difference trick
    vpsubusb   ymm2, ymm2, ymm1
    vpor       ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5  // u16.  mutates order.
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    vpmaddwd   ymm1, ymm1, ymm1
    vpaddd     ymm0, ymm0, ymm1
    vpaddd     ymm0, ymm0, ymm2
    sub        ecx, 32
    jg         wloop

    vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    vpaddd     ymm0, ymm0, ymm1
    vpermq     ymm1, ymm0, 0x02  // high + low lane.
    vpaddd     ymm0, ymm0, ymm1
    vmovd      eax, xmm0
    vzeroupper
    ret
  }
}
#endif  // _MSC_VER >= 1700

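// Constants for a vectorized djb2 hash.  The scalar recurrence is
// hash = hash * 33 + src[i]; consuming 16 bytes per iteration means the
// running hash is scaled by 33^16 while the 16 new bytes are weighted by
// 33^15 down to 33^0.  A scalar sketch of the equivalent computation
// (illustrative only):
//   uint32_t hash = seed;
//   for (int i = 0; i < count; ++i) {
//     hash = hash * 33u + src[i];
//   }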
uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};

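// SSE4.1 hash kernel: widens 16 bytes to sixteen u32 lanes with two levels
// of unpack against zero, multiplies each lane by its 33^k weight using
// pmulld (the SSE4.1 requirement), then sums the 16 products.  count is
// expected to be a multiple of 16.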
__declspec(naked) uint32_t
HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
    movd       xmm0, [esp + 12]  // seed

    pxor       xmm7, xmm7  // constant 0 for unpck
    movdqa     xmm6, xmmword ptr kHash16x33

  wloop:
    movdqu     xmm1, [eax]  // src[0-15]
    lea        eax, [eax + 16]
    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
    movdqa     xmm5, xmmword ptr kHashMul0
    movdqa     xmm2, xmm1
    punpcklbw  xmm2, xmm7  // src[0-7]
    movdqa     xmm3, xmm2
    punpcklwd  xmm3, xmm7  // src[0-3]
    pmulld     xmm3, xmm5
    movdqa     xmm5, xmmword ptr kHashMul1
    movdqa     xmm4, xmm2
    punpckhwd  xmm4, xmm7  // src[4-7]
    pmulld     xmm4, xmm5
    movdqa     xmm5, xmmword ptr kHashMul2
    punpckhbw  xmm1, xmm7  // src[8-15]
    movdqa     xmm2, xmm1
    punpcklwd  xmm2, xmm7  // src[8-11]
    pmulld     xmm2, xmm5
    movdqa     xmm5, xmmword ptr kHashMul3
    punpckhwd  xmm1, xmm7  // src[12-15]
    pmulld     xmm1, xmm5
    paddd      xmm3, xmm4  // add 16 results
    paddd      xmm1, xmm2
    paddd      xmm1, xmm3

    pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
    paddd      xmm1, xmm2
    pshufd     xmm2, xmm1, 0x01
    paddd      xmm1, xmm2
    paddd      xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    movd       eax, xmm0  // return hash
    ret
  }
}

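// The AVX2 kernel below loads and zero-extends bytes to u32 in one step
// with vpmovzxbd, replacing the unpack sequence of the SSE4.1 version.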
// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked) uint32_t
HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
  __asm {
    mov        eax, [esp + 4]  // src
    mov        ecx, [esp + 8]  // count
    vmovd      xmm0, [esp + 12]  // seed

  wloop:
    vpmovzxbd  xmm3, [eax]  // src[0-3]
    vpmulld    xmm0, xmm0, xmmword ptr kHash16x33  // hash *= 33 ^ 16
    vpmovzxbd  xmm4, [eax + 4]  // src[4-7]
    vpmulld    xmm3, xmm3, xmmword ptr kHashMul0
    vpmovzxbd  xmm2, [eax + 8]  // src[8-11]
    vpmulld    xmm4, xmm4, xmmword ptr kHashMul1
    vpmovzxbd  xmm1, [eax + 12]  // src[12-15]
    vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
    lea        eax, [eax + 16]
    vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
    vpaddd     xmm3, xmm3, xmm4  // add 16 results
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm1, xmm1, xmm3
    vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
    vpaddd     xmm1, xmm1, xmm2
    vpshufd    xmm2, xmm1, 0x01
    vpaddd     xmm1, xmm1, xmm2
    vpaddd     xmm0, xmm0, xmm1
    sub        ecx, 16
    jg         wloop

    vmovd      eax, xmm0  // return hash
    vzeroupper
    ret
  }
}
#endif  // _MSC_VER >= 1700

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif