/*
 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

__declspec(naked)
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov eax, [esp + 4] // src_a
    mov edx, [esp + 8] // src_b
    mov ecx, [esp + 12] // count
    pxor xmm0, xmm0 // sum
    pxor xmm5, xmm5 // constant 0 for unpck

  wloop:
    movdqu xmm1, [eax]
    lea eax, [eax + 16]
    movdqu xmm2, [edx]
    lea edx, [edx + 16]
    movdqa xmm3, xmm1 // abs trick: |a - b| via two saturating subtracts
    psubusb xmm1, xmm2 // a - b, saturated to 0
    psubusb xmm2, xmm3 // b - a, saturated to 0
    por xmm1, xmm2 // |a - b|; one operand is always 0
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm5 // widen low 8 u8 to u16
    punpckhbw xmm2, xmm5 // widen high 8 u8 to u16
    pmaddwd xmm1, xmm1 // square + hadd adjacent pairs to u32
    pmaddwd xmm2, xmm2
    paddd xmm0, xmm1 // accumulate
    paddd xmm0, xmm2
    sub ecx, 16
    jg wloop

    pshufd xmm1, xmm0, 0xee // 3, 2 + 1, 0
    paddd xmm0, xmm1
    pshufd xmm1, xmm0, 0x01 // 1 + 0
    paddd xmm0, xmm1
    movd eax, xmm0 // return sse
    ret
  }
}
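
// For reference, a scalar sketch of what the SumSquareError kernels in this
// file compute. The helper name is illustrative, not libyuv's shipped C
// fallback. The psubusb/por pair above yields |a - b| because unsigned
// saturation clamps the negative direction to zero.
static uint32 SumSquareError_Sketch(const uint8* src_a, const uint8* src_b,
                                    int count) {
  uint32 sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int diff = src_a[i] - src_b[i];
    sse += (uint32)(diff * diff); // accumulate squared difference, mod 2^32.
  }
  return sse;
}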

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
#pragma warning(disable: 4752)
__declspec(naked)
uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
  __asm {
    mov eax, [esp + 4] // src_a
    mov edx, [esp + 8] // src_b
    mov ecx, [esp + 12] // count
    vpxor ymm0, ymm0, ymm0 // sum
    vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
    sub edx, eax // address src_b as eax + edx to free a register

  wloop:
    vmovdqu ymm1, [eax]
    vmovdqu ymm2, [eax + edx]
    lea eax, [eax + 32]
    vpsubusb ymm3, ymm1, ymm2 // abs difference trick
    vpsubusb ymm2, ymm2, ymm1
    vpor ymm1, ymm2, ymm3
    vpunpcklbw ymm2, ymm1, ymm5 // u16. in-lane unpack mutates order; sum is order-independent.
    vpunpckhbw ymm1, ymm1, ymm5
    vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
    vpmaddwd ymm1, ymm1, ymm1
    vpaddd ymm0, ymm0, ymm1
    vpaddd ymm0, ymm0, ymm2
    sub ecx, 32
    jg wloop

    vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
    vpaddd ymm0, ymm0, ymm1
    vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
    vpaddd ymm0, ymm0, ymm1
    vpermq ymm1, ymm0, 0x02 // high + low lane.
    vpaddd ymm0, ymm0, ymm1
    vmovd eax, xmm0
    vzeroupper
    ret
  }
}
#endif // _MSC_VER >= 1700
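
// Note: pmaddwd/vpmaddwd above multiply signed 16-bit lanes and add adjacent
// pairs. That is safe here because each lane holds |a - b| <= 255, well
// within signed 16-bit range. A scalar model of one lane pair; the helper
// name is an illustrative assumption, not libyuv code:
static uint32 SquarePairSum_Sketch(uint16 d0, uint16 d1) {
  return (uint32)d0 * d0 + (uint32)d1 * d1; // d0*d0 + d1*d1 into one u32 lane.
}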

uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
uvec32 kHashMul0 = {
  0x0c3525e1, // 33 ^ 15
  0xa3476dc1, // 33 ^ 14
  0x3b4039a1, // 33 ^ 13
  0x4f5f0981, // 33 ^ 12
};
uvec32 kHashMul1 = {
  0x30f35d61, // 33 ^ 11
  0x855cb541, // 33 ^ 10
  0x040a9121, // 33 ^ 9
  0x747c7101, // 33 ^ 8
};
uvec32 kHashMul2 = {
  0xec41d4e1, // 33 ^ 7
  0x4cfa3cc1, // 33 ^ 6
  0x025528a1, // 33 ^ 5
  0x00121881, // 33 ^ 4
};
uvec32 kHashMul3 = {
  0x00008c61, // 33 ^ 3
  0x00000441, // 33 ^ 2
  0x00000021, // 33 ^ 1
  0x00000001, // 33 ^ 0
};
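
// The tables above unroll the djb2 recurrence hash = hash * 33 + src[i]
// 16 bytes at a time: 16 steps fold into hash * 33^16 (kHash16x33) plus a
// dot product of the bytes with 33^15..33^0 (kHashMul0..kHashMul3), all
// modulo 2^32. A scalar sketch of one 16-byte step; the helper name is
// illustrative, not part of libyuv's API:
static uint32 HashDjb2_Step16_Sketch(const uint8* src, uint32 hash) {
  int i;
  for (i = 0; i < 16; ++i) {
    hash = hash * 33u + src[i]; // classic djb2 step, wraps modulo 2^32.
  }
  return hash;
}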

__declspec(naked)
uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
  __asm {
    mov eax, [esp + 4] // src
    mov ecx, [esp + 8] // count
    movd xmm0, [esp + 12] // seed

    pxor xmm7, xmm7 // constant 0 for unpck
    movdqa xmm6, xmmword ptr kHash16x33

  wloop:
    movdqu xmm1, [eax] // src[0-15]
    lea eax, [eax + 16]
    pmulld xmm0, xmm6 // hash *= 33 ^ 16
    movdqa xmm5, xmmword ptr kHashMul0
    movdqa xmm2, xmm1
    punpcklbw xmm2, xmm7 // src[0-7]
    movdqa xmm3, xmm2
    punpcklwd xmm3, xmm7 // src[0-3]
    pmulld xmm3, xmm5
    movdqa xmm5, xmmword ptr kHashMul1
    movdqa xmm4, xmm2
    punpckhwd xmm4, xmm7 // src[4-7]
    pmulld xmm4, xmm5
    movdqa xmm5, xmmword ptr kHashMul2
    punpckhbw xmm1, xmm7 // src[8-15]
    movdqa xmm2, xmm1
    punpcklwd xmm2, xmm7 // src[8-11]
    pmulld xmm2, xmm5
    movdqa xmm5, xmmword ptr kHashMul3
    punpckhwd xmm1, xmm7 // src[12-15]
    pmulld xmm1, xmm5
    paddd xmm3, xmm4 // add 16 results
    paddd xmm1, xmm2
    paddd xmm1, xmm3

    pshufd xmm2, xmm1, 0x0e // upper 2 dwords
    paddd xmm1, xmm2
    pshufd xmm2, xmm1, 0x01
    paddd xmm1, xmm2
    paddd xmm0, xmm1
    sub ecx, 16
    jg wloop

    movd eax, xmm0 // return hash
    ret
  }
}

// Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
__declspec(naked)
uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
  __asm {
    mov eax, [esp + 4] // src
    mov ecx, [esp + 8] // count
    vmovd xmm0, [esp + 12] // seed

  wloop:
    vpmovzxbd xmm3, [eax] // src[0-3]
    vpmulld xmm0, xmm0, xmmword ptr kHash16x33 // hash *= 33 ^ 16
    vpmovzxbd xmm4, [eax + 4] // src[4-7]
    vpmulld xmm3, xmm3, xmmword ptr kHashMul0
    vpmovzxbd xmm2, [eax + 8] // src[8-11]
    vpmulld xmm4, xmm4, xmmword ptr kHashMul1
    vpmovzxbd xmm1, [eax + 12] // src[12-15]
    vpmulld xmm2, xmm2, xmmword ptr kHashMul2
    lea eax, [eax + 16]
    vpmulld xmm1, xmm1, xmmword ptr kHashMul3
    vpaddd xmm3, xmm3, xmm4 // add 16 results
    vpaddd xmm1, xmm1, xmm2
    vpaddd xmm1, xmm1, xmm3
    vpshufd xmm2, xmm1, 0x0e // upper 2 dwords
    vpaddd xmm1, xmm1, xmm2
    vpshufd xmm2, xmm1, 0x01
    vpaddd xmm1, xmm1, xmm2
    vpaddd xmm0, xmm0, xmm1
    sub ecx, 16
    jg wloop

    vmovd eax, xmm0 // return hash
    vzeroupper
    ret
  }
}
#endif // _MSC_VER >= 1700

#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif