/*
 *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/basic_types.h"

#include "libyuv/compare_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

#if defined(__x86_64__)
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint64_t diff = 0u;

  asm volatile(
      "xor %3,%3 \n"
      "xor %%r8,%%r8 \n"
      "xor %%r9,%%r9 \n"
      "xor %%r10,%%r10 \n"

      // Process 32 bytes per loop.
      LABELALIGN
      "1: \n"
      "mov (%0),%%rcx \n"
      "mov 0x8(%0),%%rdx \n"
      "xor (%1),%%rcx \n"
      "xor 0x8(%1),%%rdx \n"
      "popcnt %%rcx,%%rcx \n"
      "popcnt %%rdx,%%rdx \n"
      "mov 0x10(%0),%%rsi \n"
      "mov 0x18(%0),%%rdi \n"
      "xor 0x10(%1),%%rsi \n"
      "xor 0x18(%1),%%rdi \n"
      "popcnt %%rsi,%%rsi \n"
      "popcnt %%rdi,%%rdi \n"
      "add $0x20,%0 \n"
      "add $0x20,%1 \n"
      "add %%rcx,%3 \n"
      "add %%rdx,%%r8 \n"
      "add %%rsi,%%r9 \n"
      "add %%rdi,%%r10 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"

      "add %%r8, %3 \n"
      "add %%r9, %3 \n"
      "add %%r10, %3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=r"(diff)    // %3
      :
      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");

  return static_cast<uint32_t>(diff);
}
#else
uint32_t HammingDistance_SSE42(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  asm volatile(
      // Process 16 bytes per loop.
      LABELALIGN
      "1: \n"
      "mov (%0),%%ecx \n"
      "mov 0x4(%0),%%edx \n"
      "xor (%1),%%ecx \n"
      "xor 0x4(%1),%%edx \n"
      "popcnt %%ecx,%%ecx \n"
      "add %%ecx,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      "mov 0x8(%0),%%ecx \n"
      "mov 0xc(%0),%%edx \n"
      "xor 0x8(%1),%%ecx \n"
      "xor 0xc(%1),%%edx \n"
      "popcnt %%ecx,%%ecx \n"
      "add %%ecx,%3 \n"
      "popcnt %%edx,%%edx \n"
      "add %%edx,%3 \n"
      "add $0x10,%0 \n"
      "add $0x10,%1 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "+r"(diff)    // %3
      :
      : "memory", "cc", "ecx", "edx");

  return diff;
}
#endif
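
// For reference, a minimal scalar sketch (illustration only, not built) of
// what the SSE42 kernels above compute. The function name is hypothetical;
// it assumes count is a positive multiple of 8, <string.h> for memcpy, and
// GCC/Clang's __builtin_popcountll.
#if 0
static uint32_t HammingDistanceScalarSketch(const uint8_t* src_a,
                                            const uint8_t* src_b,
                                            int count) {
  uint64_t diff = 0u;
  int i;
  for (i = 0; i < count; i += 8) {
    uint64_t a;
    uint64_t b;
    memcpy(&a, src_a + i, sizeof(a));  // unaligned-safe 64-bit loads
    memcpy(&b, src_b + i, sizeof(b));
    diff += __builtin_popcountll(a ^ b);  // count differing bits per word
  }
  return (uint32_t)diff;
}
#endif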

static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
                                 15, 15, 15, 15, 15, 15, 15, 15};
static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
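
// Note: HammingDistance_SSSE3/_AVX2 below count bits with the classic
// nibble-lookup trick: each byte of the xor is split into its low and high
// nibble, pshufb uses kBitCount as a 16-entry table to fetch each nibble's
// popcount, the two results are added per byte, and psadbw against zero
// horizontally sums the bytes into 64-bit lanes.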

uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
                               const uint8_t* src_b,
                               int count) {
  uint32_t diff = 0u;

  asm volatile(
      "movdqa %4,%%xmm2 \n"
      "movdqa %5,%%xmm3 \n"
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm1,%%xmm1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "movdqa (%0),%%xmm4 \n"
      "movdqa 0x10(%0),%%xmm5 \n"
      "pxor (%0,%1),%%xmm4 \n"
      "movdqa %%xmm4,%%xmm6 \n"
      "pand %%xmm2,%%xmm6 \n"
      "psrlw $0x4,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "pshufb %%xmm6,%%xmm7 \n"
      "pand %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm6 \n"
      "pshufb %%xmm4,%%xmm6 \n"
      "paddb %%xmm7,%%xmm6 \n"
      "pxor 0x10(%0,%1),%%xmm5 \n"
      "add $0x20,%0 \n"
      "movdqa %%xmm5,%%xmm4 \n"
      "pand %%xmm2,%%xmm5 \n"
      "psrlw $0x4,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm7 \n"
      "pshufb %%xmm5,%%xmm7 \n"
      "pand %%xmm2,%%xmm4 \n"
      "movdqa %%xmm3,%%xmm5 \n"
      "pshufb %%xmm4,%%xmm5 \n"
      "paddb %%xmm7,%%xmm5 \n"
      "paddb %%xmm5,%%xmm6 \n"
      "psadbw %%xmm1,%%xmm6 \n"
      "paddd %%xmm6,%%xmm0 \n"
      "sub $0x20,%2 \n"
      "jg 1b \n"

      "pshufd $0xaa,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "movd %%xmm0,%3 \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");

  return diff;
}

#ifdef HAS_HAMMINGDISTANCE_AVX2
uint32_t HammingDistance_AVX2(const uint8_t* src_a,
                              const uint8_t* src_b,
                              int count) {
  uint32_t diff = 0u;

  asm volatile(
      "vbroadcastf128 %4,%%ymm2 \n"
      "vbroadcastf128 %5,%%ymm3 \n"
      "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
      "vpxor %%ymm1,%%ymm1,%%ymm1 \n"
      "sub %0,%1 \n"

      LABELALIGN
      "1: \n"
      "vmovdqa (%0),%%ymm4 \n"
      "vmovdqa 0x20(%0),%%ymm5 \n"
      "vpxor (%0,%1),%%ymm4,%%ymm4 \n"
      "vpand %%ymm2,%%ymm4,%%ymm6 \n"
      "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm6,%%ymm3,%%ymm6 \n"
      "vpand %%ymm2,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
      "vpaddb %%ymm4,%%ymm6,%%ymm6 \n"
      "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n"
      "add $0x40,%0 \n"
      "vpand %%ymm2,%%ymm4,%%ymm5 \n"
      "vpsrlw $0x4,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm5,%%ymm3,%%ymm5 \n"
      "vpand %%ymm2,%%ymm4,%%ymm4 \n"
      "vpshufb %%ymm4,%%ymm3,%%ymm4 \n"
      "vpaddb %%ymm5,%%ymm4,%%ymm4 \n"
      "vpaddb %%ymm6,%%ymm4,%%ymm4 \n"
      "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n"
      "vpaddd %%ymm0,%%ymm4,%%ymm0 \n"
      "sub $0x40,%2 \n"
      "jg 1b \n"

      "vpermq $0xb1,%%ymm0,%%ymm1 \n"
      "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xaa,%%ymm0,%%ymm1 \n"
      "vpaddd %%ymm1,%%ymm0,%%ymm0 \n"
      "vmovd %%xmm0,%3 \n"
      "vzeroupper \n"
      : "+r"(src_a),       // %0
        "+r"(src_b),       // %1
        "+r"(count),       // %2
        "=r"(diff)         // %3
      : "m"(kNibbleMask),  // %4
        "m"(kBitCount)     // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");

  return diff;
}
#endif  // HAS_HAMMINGDISTANCE_AVX2

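// SumSquareError_SSE2 computes the absolute difference of each byte pair
// with two saturated subtracts OR'd together (|a - b| for unsigned bytes),
// widens bytes to 16-bit words, then uses pmaddwd to square and pairwise-add
// into 32-bit accumulators.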
uint32_t SumSquareError_SSE2(const uint8_t* src_a,
                             const uint8_t* src_b,
                             int count) {
  uint32_t sse;
  asm volatile(
      "pxor %%xmm0,%%xmm0 \n"
      "pxor %%xmm5,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "movdqu (%1),%%xmm2 \n"
      "lea 0x10(%1),%1 \n"
      "movdqa %%xmm1,%%xmm3 \n"
      "psubusb %%xmm2,%%xmm1 \n"
      "psubusb %%xmm3,%%xmm2 \n"
      "por %%xmm2,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm5,%%xmm1 \n"
      "punpckhbw %%xmm5,%%xmm2 \n"
      "pmaddwd %%xmm1,%%xmm1 \n"
      "pmaddwd %%xmm2,%%xmm2 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "paddd %%xmm2,%%xmm0 \n"
      "sub $0x10,%2 \n"
      "jg 1b \n"

      "pshufd $0xee,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "pshufd $0x1,%%xmm0,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "movd %%xmm0,%3 \n"
      : "+r"(src_a),  // %0
        "+r"(src_b),  // %1
        "+r"(count),  // %2
        "=g"(sse)     // %3
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
  return sse;
}
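
// A minimal scalar sketch (illustration only, not built; the name is mine)
// of the same sum of squared errors, assuming count is a positive multiple
// of 16 and the total fits in 32 bits:
#if 0
static uint32_t SumSquareErrorScalarSketch(const uint8_t* src_a,
                                           const uint8_t* src_b,
                                           int count) {
  uint32_t sse = 0u;
  int i;
  for (i = 0; i < count; ++i) {
    int d = src_a[i] - src_b[i];
    sse += (uint32_t)(d * d);  // squared byte difference
  }
  return sse;
}
#endif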
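// The djb2 hash is hash = hash * 33 + src[i] per byte. Unrolled over 16
// bytes that becomes hash' = hash * 33^16 + sum(src[i] * 33^(15 - i)), so
// kHash16x33 rescales the running hash and kHashMul0..3 hold the per-byte
// powers of 33 (all mod 2^32).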
static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
static const uvec32 kHashMul0 = {
    0x0c3525e1,  // 33 ^ 15
    0xa3476dc1,  // 33 ^ 14
    0x3b4039a1,  // 33 ^ 13
    0x4f5f0981,  // 33 ^ 12
};
static const uvec32 kHashMul1 = {
    0x30f35d61,  // 33 ^ 11
    0x855cb541,  // 33 ^ 10
    0x040a9121,  // 33 ^ 9
    0x747c7101,  // 33 ^ 8
};
static const uvec32 kHashMul2 = {
    0xec41d4e1,  // 33 ^ 7
    0x4cfa3cc1,  // 33 ^ 6
    0x025528a1,  // 33 ^ 5
    0x00121881,  // 33 ^ 4
};
static const uvec32 kHashMul3 = {
    0x00008c61,  // 33 ^ 3
    0x00000441,  // 33 ^ 2
    0x00000021,  // 33 ^ 1
    0x00000001,  // 33 ^ 0
};

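// A scalar djb2 sketch (illustration only, not built; the name is mine)
// that the SSE4.1 kernel below should match when count is a positive
// multiple of 16:
#if 0
static uint32_t HashDjb2ScalarSketch(const uint8_t* src,
                                     int count,
                                     uint32_t seed) {
  uint32_t hash = seed;
  int i;
  for (i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];  // classic djb2 step, mod 2^32
  }
  return hash;
}
#endif
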
uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash;
  asm volatile(
      "movd %2,%%xmm0 \n"
      "pxor %%xmm7,%%xmm7 \n"
      "movdqa %4,%%xmm6 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm1 \n"
      "lea 0x10(%0),%0 \n"
      "pmulld %%xmm6,%%xmm0 \n"
      "movdqa %5,%%xmm5 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklbw %%xmm7,%%xmm2 \n"
      "movdqa %%xmm2,%%xmm3 \n"
      "punpcklwd %%xmm7,%%xmm3 \n"
      "pmulld %%xmm5,%%xmm3 \n"
      "movdqa %6,%%xmm5 \n"
      "movdqa %%xmm2,%%xmm4 \n"
      "punpckhwd %%xmm7,%%xmm4 \n"
      "pmulld %%xmm5,%%xmm4 \n"
      "movdqa %7,%%xmm5 \n"
      "punpckhbw %%xmm7,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "punpcklwd %%xmm7,%%xmm2 \n"
      "pmulld %%xmm5,%%xmm2 \n"
      "movdqa %8,%%xmm5 \n"
      "punpckhwd %%xmm7,%%xmm1 \n"
      "pmulld %%xmm5,%%xmm1 \n"
      "paddd %%xmm4,%%xmm3 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "paddd %%xmm3,%%xmm1 \n"
      "pshufd $0xe,%%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "pshufd $0x1,%%xmm1,%%xmm2 \n"
      "paddd %%xmm2,%%xmm1 \n"
      "paddd %%xmm1,%%xmm0 \n"
      "sub $0x10,%1 \n"
      "jg 1b \n"
      "movd %%xmm0,%3 \n"
      : "+r"(src),        // %0
        "+r"(count),      // %1
        "+rm"(seed),      // %2
        "=g"(hash)        // %3
      : "m"(kHash16x33),  // %4
        "m"(kHashMul0),   // %5
        "m"(kHashMul1),   // %6
        "m"(kHashMul2),   // %7
        "m"(kHashMul3)    // %8
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
  return hash;
}
#endif  // defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif