1 /*
2 * Copyright 2012 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/basic_types.h"
12 #include "libyuv/row.h"
13
14 #ifdef __cplusplus
15 namespace libyuv {
16 extern "C" {
17 #endif
18
19 // This module is for Visual C x86.
20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
21 defined(_MSC_VER) && !defined(__clang__)
22
23 __declspec(naked)
SumSquareError_SSE2(const uint8 * src_a,const uint8 * src_b,int count)24 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
25 __asm {
26 mov eax, [esp + 4] // src_a
27 mov edx, [esp + 8] // src_b
28 mov ecx, [esp + 12] // count
29 pxor xmm0, xmm0
30 pxor xmm5, xmm5
31
32 wloop:
33 movdqu xmm1, [eax]
34 lea eax, [eax + 16]
35 movdqu xmm2, [edx]
36 lea edx, [edx + 16]
37 movdqa xmm3, xmm1 // abs trick
38 psubusb xmm1, xmm2
39 psubusb xmm2, xmm3
40 por xmm1, xmm2
41 movdqa xmm2, xmm1
42 punpcklbw xmm1, xmm5
43 punpckhbw xmm2, xmm5
44 pmaddwd xmm1, xmm1
45 pmaddwd xmm2, xmm2
46 paddd xmm0, xmm1
47 paddd xmm0, xmm2
48 sub ecx, 16
49 jg wloop
50
51 pshufd xmm1, xmm0, 0xee
52 paddd xmm0, xmm1
53 pshufd xmm1, xmm0, 0x01
54 paddd xmm0, xmm1
55 movd eax, xmm0
56 ret
57 }
58 }
59
60 // Visual C 2012 required for AVX2.
61 #if _MSC_VER >= 1700
62 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
63 #pragma warning(disable: 4752)
64 __declspec(naked)
SumSquareError_AVX2(const uint8 * src_a,const uint8 * src_b,int count)65 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
66 __asm {
67 mov eax, [esp + 4] // src_a
68 mov edx, [esp + 8] // src_b
69 mov ecx, [esp + 12] // count
70 vpxor ymm0, ymm0, ymm0 // sum
71 vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
72 sub edx, eax
73
74 wloop:
75 vmovdqu ymm1, [eax]
76 vmovdqu ymm2, [eax + edx]
77 lea eax, [eax + 32]
78 vpsubusb ymm3, ymm1, ymm2 // abs difference trick
79 vpsubusb ymm2, ymm2, ymm1
80 vpor ymm1, ymm2, ymm3
81 vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
82 vpunpckhbw ymm1, ymm1, ymm5
83 vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
84 vpmaddwd ymm1, ymm1, ymm1
85 vpaddd ymm0, ymm0, ymm1
86 vpaddd ymm0, ymm0, ymm2
87 sub ecx, 32
88 jg wloop
89
90 vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
91 vpaddd ymm0, ymm0, ymm1
92 vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
93 vpaddd ymm0, ymm0, ymm1
94 vpermq ymm1, ymm0, 0x02 // high + low lane.
95 vpaddd ymm0, ymm0, ymm1
96 vmovd eax, xmm0
97 vzeroupper
98 ret
99 }
100 }
101 #endif // _MSC_VER >= 1700
102
103 #define HAS_HASHDJB2_SSE41
104 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
105 static uvec32 kHashMul0 = {
106 0x0c3525e1, // 33 ^ 15
107 0xa3476dc1, // 33 ^ 14
108 0x3b4039a1, // 33 ^ 13
109 0x4f5f0981, // 33 ^ 12
110 };
111 static uvec32 kHashMul1 = {
112 0x30f35d61, // 33 ^ 11
113 0x855cb541, // 33 ^ 10
114 0x040a9121, // 33 ^ 9
115 0x747c7101, // 33 ^ 8
116 };
117 static uvec32 kHashMul2 = {
118 0xec41d4e1, // 33 ^ 7
119 0x4cfa3cc1, // 33 ^ 6
120 0x025528a1, // 33 ^ 5
121 0x00121881, // 33 ^ 4
122 };
123 static uvec32 kHashMul3 = {
124 0x00008c61, // 33 ^ 3
125 0x00000441, // 33 ^ 2
126 0x00000021, // 33 ^ 1
127 0x00000001, // 33 ^ 0
128 };
129
// Hand-encoded SSE4.1 "pmulld xmm_dst, xmm_src" (opcode bytes 66 0F 38 40
// followed by a ModRM byte), emitted byte by byte instead of through the
// mnemonic. NOTE(review): presumably because older Visual C inline assemblers
// do not recognize the mnemonic -- confirm.
//
// The argument is the ModRM byte: 0xC0 | (dst << 3) | src. Values used here:
//   0xC6  pmulld xmm0, xmm6
//   0xDD  pmulld xmm3, xmm5
//   0xE5  pmulld xmm4, xmm5
//   0xD5  pmulld xmm2, xmm5
//   0xCD  pmulld xmm1, xmm5
#define pmulld(modrm) __asm _emit 0x66 __asm _emit 0x0F __asm _emit 0x38 \
    __asm _emit 0x40 __asm _emit modrm
137
138 __declspec(naked)
HashDjb2_SSE41(const uint8 * src,int count,uint32 seed)139 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
140 __asm {
141 mov eax, [esp + 4] // src
142 mov ecx, [esp + 8] // count
143 movd xmm0, [esp + 12] // seed
144
145 pxor xmm7, xmm7 // constant 0 for unpck
146 movdqa xmm6, kHash16x33
147
148 wloop:
149 movdqu xmm1, [eax] // src[0-15]
150 lea eax, [eax + 16]
151 pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
152 movdqa xmm5, kHashMul0
153 movdqa xmm2, xmm1
154 punpcklbw xmm2, xmm7 // src[0-7]
155 movdqa xmm3, xmm2
156 punpcklwd xmm3, xmm7 // src[0-3]
157 pmulld(0xdd) // pmulld xmm3, xmm5
158 movdqa xmm5, kHashMul1
159 movdqa xmm4, xmm2
160 punpckhwd xmm4, xmm7 // src[4-7]
161 pmulld(0xe5) // pmulld xmm4, xmm5
162 movdqa xmm5, kHashMul2
163 punpckhbw xmm1, xmm7 // src[8-15]
164 movdqa xmm2, xmm1
165 punpcklwd xmm2, xmm7 // src[8-11]
166 pmulld(0xd5) // pmulld xmm2, xmm5
167 movdqa xmm5, kHashMul3
168 punpckhwd xmm1, xmm7 // src[12-15]
169 pmulld(0xcd) // pmulld xmm1, xmm5
170 paddd xmm3, xmm4 // add 16 results
171 paddd xmm1, xmm2
172 paddd xmm1, xmm3
173
174 pshufd xmm2, xmm1, 0x0e // upper 2 dwords
175 paddd xmm1, xmm2
176 pshufd xmm2, xmm1, 0x01
177 paddd xmm1, xmm2
178 paddd xmm0, xmm1
179 sub ecx, 16
180 jg wloop
181
182 movd eax, xmm0 // return hash
183 ret
184 }
185 }
186
187 // Visual C 2012 required for AVX2.
188 #if _MSC_VER >= 1700
189 __declspec(naked)
HashDjb2_AVX2(const uint8 * src,int count,uint32 seed)190 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
191 __asm {
192 mov eax, [esp + 4] // src
193 mov ecx, [esp + 8] // count
194 movd xmm0, [esp + 12] // seed
195 movdqa xmm6, kHash16x33
196
197 wloop:
198 vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
199 pmulld xmm0, xmm6 // hash *= 33 ^ 16
200 vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
201 pmulld xmm3, kHashMul0
202 vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
203 pmulld xmm4, kHashMul1
204 vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
205 pmulld xmm2, kHashMul2
206 lea eax, [eax + 16]
207 pmulld xmm1, kHashMul3
208 paddd xmm3, xmm4 // add 16 results
209 paddd xmm1, xmm2
210 paddd xmm1, xmm3
211 pshufd xmm2, xmm1, 0x0e // upper 2 dwords
212 paddd xmm1, xmm2
213 pshufd xmm2, xmm1, 0x01
214 paddd xmm1, xmm2
215 paddd xmm0, xmm1
216 sub ecx, 16
217 jg wloop
218
219 movd eax, xmm0 // return hash
220 ret
221 }
222 }
223 #endif // _MSC_VER >= 1700
224 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
225
226 #ifdef __cplusplus
227 } // extern "C"
228 } // namespace libyuv
229 #endif
230