1 /* Xxh64.c -- XXH64 hash calculation
2 original code: Copyright (c) Yann Collet.
3 2023-08-18 : modified by Igor Pavlov.
4 This source code is licensed under BSD 2-Clause License.
5 */
6 
7 #include "Precomp.h"
8 
9 #include "CpuArch.h"
10 #include "RotateDefs.h"
11 #include "Xxh64.h"
12 
13 #define Z7_XXH_PRIME64_1  UINT64_CONST(0x9E3779B185EBCA87)
14 #define Z7_XXH_PRIME64_2  UINT64_CONST(0xC2B2AE3D27D4EB4F)
15 #define Z7_XXH_PRIME64_3  UINT64_CONST(0x165667B19E3779F9)
16 #define Z7_XXH_PRIME64_4  UINT64_CONST(0x85EBCA77C2B2AE63)
17 #define Z7_XXH_PRIME64_5  UINT64_CONST(0x27D4EB2F165667C5)
18 
Xxh64State_Init(CXxh64State * p)19 void Xxh64State_Init(CXxh64State *p)
20 {
21   const UInt64 seed = 0;
22   p->v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2;
23   p->v[1] = seed + Z7_XXH_PRIME64_2;
24   p->v[2] = seed;
25   p->v[3] = seed - Z7_XXH_PRIME64_1;
26 }
27 
28 #if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) && defined(_MSC_VER)
29   #define Z7_XXH64_USE_ASM
30 #endif
31 
32 #if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) \
33     && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1200
34 /* we try to avoid __allmul calls in MSVC for 64-bit multiply.
35    But MSVC6 still uses __allmul for our code.
36    So for MSVC6 we use default 64-bit multiply without our optimization.
37 */
38 #define LOW32(b)        ((UInt32)(b & 0xffffffff))
39 /* MSVC compiler (MSVC > 1200) can use "mul" instruction
40    without __allmul for our MY_emulu MACRO.
41    MY_emulu is similar to __emulu(a, b) MACRO */
42 #define MY_emulu(a, b)      ((UInt64)(a) * (b))
43 #define MY_SET_HIGH32(a)    ((UInt64)(a) << 32)
44 #define MY_MUL32_SET_HIGH32(a, b) MY_SET_HIGH32((UInt32)(a) * (UInt32)(b))
45 // /*
46 #define MY_MUL64(a, b) \
47     ( MY_emulu((UInt32)(a), LOW32(b)) + \
48       MY_SET_HIGH32( \
49         (UInt32)((a) >> 32) * LOW32(b) + \
50         (UInt32)(a) * (UInt32)((b) >> 32) \
51       ))
52 // */
53 /*
54 #define MY_MUL64(a, b) \
55     ( MY_emulu((UInt32)(a), LOW32(b)) \
56       + MY_MUL32_SET_HIGH32((a) >> 32, LOW32(b)) + \
57       + MY_MUL32_SET_HIGH32(a, (b) >> 32) \
58     )
59 */
60 
61 #define MY_MUL_32_64(a32, b) \
62     ( MY_emulu((UInt32)(a32), LOW32(b)) \
63       + MY_MUL32_SET_HIGH32(a32, (b) >> 32) \
64     )
65 
66 #else
67 #define MY_MUL64(a, b)        ((a) * (b))
68 #define MY_MUL_32_64(a32, b)  ((a32) * (UInt64)(b))
69 #endif
70 
71 
72 static
73 Z7_FORCE_INLINE
Xxh64_Round(UInt64 acc,UInt64 input)74 UInt64 Xxh64_Round(UInt64 acc, UInt64 input)
75 {
76   acc += MY_MUL64(input, Z7_XXH_PRIME64_2);
77   acc = Z7_ROTL64(acc, 31);
78   return MY_MUL64(acc, Z7_XXH_PRIME64_1);
79 }
80 
Xxh64_Merge(UInt64 acc,UInt64 val)81 static UInt64 Xxh64_Merge(UInt64 acc, UInt64 val)
82 {
83   acc ^= Xxh64_Round(0, val);
84   return MY_MUL64(acc, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
85 }
86 
87 
88 #ifdef Z7_XXH64_USE_ASM
89 
90 #define Z7_XXH_PRIME64_1_HIGH  0x9E3779B1
91 #define Z7_XXH_PRIME64_1_LOW   0x85EBCA87
92 #define Z7_XXH_PRIME64_2_HIGH  0xC2B2AE3D
93 #define Z7_XXH_PRIME64_2_LOW   0x27D4EB4F
94 
95 void
96 Z7_NO_INLINE
97 __declspec(naked)
98 Z7_FASTCALL
Xxh64State_UpdateBlocks(CXxh64State * p,const void * data,const void * end)99 Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end)
100 {
101  #if !defined(__clang__)
102   UNUSED_VAR(p)
103   UNUSED_VAR(data)
104   UNUSED_VAR(end)
105  #endif
106   __asm   push    ebx
107   __asm   push    ebp
108   __asm   push    esi
109   __asm   push    edi
110 
111   #define STACK_OFFSET  4 * 8
112   __asm   sub     esp, STACK_OFFSET
113 
114 #define COPY_1(n) \
115   __asm   mov     eax, [ecx + n * 4] \
116   __asm   mov     [esp + n * 4], eax \
117 
118 #define COPY_2(n) \
119   __asm   mov     eax, [esp + n * 4] \
120   __asm   mov     [ecx + n * 4], eax \
121 
122   COPY_1(0)
123   __asm   mov     edi, [ecx + 1 * 4] \
124   COPY_1(2)
125   COPY_1(3)
126   COPY_1(4)
127   COPY_1(5)
128   COPY_1(6)
129   COPY_1(7)
130 
131   __asm   mov     esi, edx \
132   __asm   mov     [esp + 0 * 8 + 4], ecx
133   __asm   mov     ecx, Z7_XXH_PRIME64_2_LOW \
134   __asm   mov     ebp, Z7_XXH_PRIME64_1_LOW \
135 
136 #define R(n, state1, state1_reg) \
137   __asm   mov     eax, [esi + n * 8] \
138   __asm   imul    ebx, eax, Z7_XXH_PRIME64_2_HIGH \
139   __asm   add     ebx, state1 \
140   __asm   mul     ecx \
141   __asm   add     edx, ebx \
142   __asm   mov     ebx, [esi + n * 8 + 4] \
143   __asm   imul    ebx, ecx \
144   __asm   add     eax, [esp + n * 8] \
145   __asm   adc     edx, ebx \
146   __asm   mov     ebx, eax \
147   __asm   shld    eax, edx, 31 \
148   __asm   shld    edx, ebx, 31 \
149   __asm   imul    state1_reg, eax, Z7_XXH_PRIME64_1_HIGH \
150   __asm   imul    edx, ebp \
151   __asm   add     state1_reg, edx \
152   __asm   mul     ebp \
153   __asm   add     state1_reg, edx \
154   __asm   mov     [esp + n * 8], eax \
155 
156 #define R2(n) \
157   R(n, [esp + n * 8 + 4], ebx) \
158   __asm   mov     [esp + n * 8 + 4], ebx \
159 
160   __asm   align 16
161   __asm   main_loop:
162   R(0, edi, edi)
163   R2(1)
164   R2(2)
165   R2(3)
166   __asm   add     esi, 32
167   __asm   cmp     esi, [esp + STACK_OFFSET + 4 * 4 + 4]
168   __asm   jne     main_loop
169 
170   __asm   mov     ecx, [esp + 0 * 8 + 4]
171 
172   COPY_2(0)
173   __asm   mov     [ecx + 1 * 4], edi
174   COPY_2(2)
175   COPY_2(3)
176   COPY_2(4)
177   COPY_2(5)
178   COPY_2(6)
179   COPY_2(7)
180 
181   __asm   add     esp, STACK_OFFSET
182   __asm   pop     edi
183   __asm   pop     esi
184   __asm   pop     ebp
185   __asm   pop     ebx
186   __asm   ret     4
187 }
188 
189 #else
190 
191 void
192 Z7_NO_INLINE
193 Z7_FASTCALL
Xxh64State_UpdateBlocks(CXxh64State * p,const void * _data,const void * end)194 Xxh64State_UpdateBlocks(CXxh64State *p, const void *_data, const void *end)
195 {
196   const Byte *data = (const Byte *)_data;
197   UInt64 v[4];
198   v[0] = p->v[0];
199   v[1] = p->v[1];
200   v[2] = p->v[2];
201   v[3] = p->v[3];
202   do
203   {
204     v[0] = Xxh64_Round(v[0], GetUi64(data));  data += 8;
205     v[1] = Xxh64_Round(v[1], GetUi64(data));  data += 8;
206     v[2] = Xxh64_Round(v[2], GetUi64(data));  data += 8;
207     v[3] = Xxh64_Round(v[3], GetUi64(data));  data += 8;
208   }
209   while (data != end);
210   p->v[0] = v[0];
211   p->v[1] = v[1];
212   p->v[2] = v[2];
213   p->v[3] = v[3];
214 }
215 
216 #endif
217 
Xxh64State_Digest(const CXxh64State * p,const void * _data,UInt64 count)218 UInt64 Xxh64State_Digest(const CXxh64State *p, const void *_data, UInt64 count)
219 {
220   UInt64 h = p->v[2];
221 
222   if (count >= 32)
223   {
224     h = Z7_ROTL64(p->v[0], 1) +
225         Z7_ROTL64(p->v[1], 7) +
226         Z7_ROTL64(h, 12) +
227         Z7_ROTL64(p->v[3], 18);
228     h = Xxh64_Merge(h, p->v[0]);
229     h = Xxh64_Merge(h, p->v[1]);
230     h = Xxh64_Merge(h, p->v[2]);
231     h = Xxh64_Merge(h, p->v[3]);
232   }
233   else
234     h += Z7_XXH_PRIME64_5;
235 
236   h += count;
237 
238   // XXH64_finalize():
239   {
240     unsigned cnt = (unsigned)count & 31;
241     const Byte *data = (const Byte *)_data;
242     while (cnt >= 8)
243     {
244       h ^= Xxh64_Round(0, GetUi64(data));
245       data += 8;
246       h = Z7_ROTL64(h, 27);
247       h = MY_MUL64(h, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
248       cnt -= 8;
249     }
250     if (cnt >= 4)
251     {
252       const UInt32 v = GetUi32(data);
253       data += 4;
254       h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_1);
255       h = Z7_ROTL64(h, 23);
256       h = MY_MUL64(h, Z7_XXH_PRIME64_2) + Z7_XXH_PRIME64_3;
257       cnt -= 4;
258     }
259     while (cnt)
260     {
261       const UInt32 v = *data++;
262       h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_5);
263       h = Z7_ROTL64(h, 11);
264       h = MY_MUL64(h, Z7_XXH_PRIME64_1);
265       cnt--;
266     }
267     // XXH64_avalanche(h):
268     h ^= h >> 33;  h = MY_MUL64(h, Z7_XXH_PRIME64_2);
269     h ^= h >> 29;  h = MY_MUL64(h, Z7_XXH_PRIME64_3);
270     h ^= h >> 32;
271     return h;
272   }
273 }
274 
275 
Xxh64_Init(CXxh64 * p)276 void Xxh64_Init(CXxh64 *p)
277 {
278   Xxh64State_Init(&p->state);
279   p->count = 0;
280   p->buf64[0] = 0;
281   p->buf64[1] = 0;
282   p->buf64[2] = 0;
283   p->buf64[3] = 0;
284 }
285 
Xxh64_Update(CXxh64 * p,const void * _data,size_t size)286 void Xxh64_Update(CXxh64 *p, const void *_data, size_t size)
287 {
288   const Byte *data = (const Byte *)_data;
289   unsigned cnt;
290   if (size == 0)
291     return;
292   cnt = (unsigned)p->count;
293   p->count += size;
294 
295   if (cnt &= 31)
296   {
297     unsigned rem = 32 - cnt;
298     Byte *dest = (Byte *)p->buf64 + cnt;
299     if (rem > size)
300       rem = (unsigned)size;
301     size -= rem;
302     cnt += rem;
303     // memcpy((Byte *)p->buf64 + cnt, data, rem);
304     do
305       *dest++ = *data++;
306     while (--rem);
307     if (cnt != 32)
308       return;
309     Xxh64State_UpdateBlocks(&p->state, p->buf64, &p->buf64[4]);
310   }
311 
312   if (size &= ~(size_t)31)
313   {
314     Xxh64State_UpdateBlocks(&p->state, data, data + size);
315     data += size;
316   }
317 
318   cnt = (unsigned)p->count & 31;
319   if (cnt)
320   {
321     // memcpy(p->buf64, data, cnt);
322     Byte *dest = (Byte *)p->buf64;
323     do
324       *dest++ = *data++;
325     while (--cnt);
326   }
327 }
328