1 /* LzFind.c -- Match finder for LZ algorithms
2 2023-03-14 : Igor Pavlov : Public domain */
3
4 #include "Precomp.h"
5
6 #include <string.h>
7 // #include <stdio.h>
8
9 #include "CpuArch.h"
10 #include "LzFind.h"
11 #include "LzHash.h"
12
13 #define kBlockMoveAlign (1 << 7) // alignment for memmove()
14 #define kBlockSizeAlign (1 << 16) // alignment for block allocation
#define kBlockSizeReserveMin (1 << 24) // it's 1/256 of a 4 GB dictionary
16
17 #define kEmptyHashValue 0
18
19 #define kMaxValForNormalize ((UInt32)0)
20 // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
21
22 // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
23
24 #define GET_AVAIL_BYTES(p) \
25 Inline_MatchFinder_GetNumAvailableBytes(p)
26
27
28 // #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
29 #define kFix5HashSize kFix4HashSize
30
31 /*
32 HASH2_CALC:
33 if (hv) match, then cur[0] and cur[1] also match
34 */
35 #define HASH2_CALC hv = GetUi16(cur);
36
37 // (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
38
39 /*
40 HASH3_CALC:
41 if (cur[0]) and (h2) match, then cur[1] also match
42 if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
43 */
44 #define HASH3_CALC { \
45 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
46 h2 = temp & (kHash2Size - 1); \
47 hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
48
49 #define HASH4_CALC { \
50 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
51 h2 = temp & (kHash2Size - 1); \
52 temp ^= ((UInt32)cur[2] << 8); \
53 h3 = temp & (kHash3Size - 1); \
54 hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
55
56 #define HASH5_CALC { \
57 UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
58 h2 = temp & (kHash2Size - 1); \
59 temp ^= ((UInt32)cur[2] << 8); \
60 h3 = temp & (kHash3Size - 1); \
61 temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
62 /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
63 hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
64
65 #define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
66
67
/* Release the input window buffer (if allocated) and clear the pointer. */
static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  // note: buffer is freed even in directInput mode (bufBase is NULL then)
  ISzAlloc_Free(alloc, p->bufBase);
  p->bufBase = NULL;
}
76
77
/* Ensure the input window buffer exists with exactly (blockSize) bytes.
   Keeps an existing buffer when its size already matches.
   Returns nonzero on success, 0 on failure (blockSize == 0 or alloc failed).
   Note: blockSize stays a UInt32, because (p->streamPos - p->pos)
   arithmetic elsewhere is 32-bit. */
static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
{
  if (blockSize == 0)
    return 0;
  if (p->bufBase && p->blockSize == blockSize)
    return 1; // existing buffer is already the right size
  LzInWindow_Free(p, alloc);
  p->blockSize = blockSize;
  p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
  return (p->bufBase != NULL);
}
110
MatchFinder_GetPointerToCurrentPos(CMatchFinder * p)111 static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }
112
MatchFinder_GetNumAvailableBytes(CMatchFinder * p)113 static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }
114
115
Z7_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  /* Fill the tail of the window buffer from the input stream (or account
     for direct input) until more than (keepSizeAfter) bytes are available,
     the stream ends, or a read error occurs. */
  if (p->streamEndWasReached || p->result != SZ_OK)
    return;

  /* We use (p->streamPos - p->pos) value.
     (p->streamPos < p->pos) is allowed. */

  if (p->directInput)
  {
    /* direct input: no copying; just advance streamPos by as many bytes as
       fit in the 32-bit (streamPos - pos) window */
    UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
    if (curSize > p->directInputRem)
      curSize = (UInt32)p->directInputRem;
    p->streamPos += curSize;
    p->directInputRem -= curSize;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }

  for (;;)
  {
    const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
    size_t size = (size_t)(p->bufBase + p->blockSize - dest);
    if (size == 0)
    {
      /* we call ReadBlock() after NeedMove() and MoveBlock().
         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
         to the end of (blockSize).
         So we don't execute this branch in normal code flow.
         We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock().
      */
      // p->result = SZ_ERROR_FAIL; // we can show error here
      return;
    }

    // #define kRead 3
    // if (size > kRead) size = kRead; // for debug

    /*
    // we need cast (Byte *)dest.
    #ifdef __clang__
    #pragma GCC diagnostic ignored "-Wcast-qual"
    #endif
    */
    // (p->bufBase + (dest - p->bufBase)) == dest, but without a const cast
    p->result = ISeqInStream_Read(p->stream,
        p->bufBase + (dest - p->bufBase), &size);
    if (p->result != SZ_OK)
      return;
    if (size == 0)
    {
      p->streamEndWasReached = 1;
      return;
    }
    p->streamPos += (UInt32)size;
    if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
      return;
    /* here and in other (p->keepSizeAfter) checks we keep 1 byte more than was requested by the Create() function
       (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
  }

  // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
}
180
181
182
Z7_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  /* Shift the window contents to the start of the buffer, preserving
     (keepSizeBefore) history bytes before the current position.
     The memmove() source is rounded down to kBlockMoveAlign so the copy
     source stays aligned; the extra (offset & (align - 1)) bytes are
     simply kept in front of the history. */
  const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
  const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
  p->buffer = p->bufBase + keepBefore;
  memmove(p->bufBase,
      p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
      keepBefore + (size_t)GET_AVAIL_BYTES(p));
}
193
/* We call MoveBlock() before ReadBlock().
   So MoveBlock() can be a wasteful operation, if the whole input data
   can fit in the current block even without calling MoveBlock().
   In the important case where (dataSize <= historySize), the
   condition (p->blockSize > dataSize + p->keepSizeAfter) is met.
   So there is no MoveBlock() in that case.
*/
201
/* Returns nonzero when the window must be shifted (MatchFinder_MoveBlock)
   before more data can be read. Never needed for direct input, after the
   stream end, or after an error. */
int MatchFinder_NeedMove(CMatchFinder *p)
{
  if (p->directInput || p->streamEndWasReached || p->result != SZ_OK)
    return 0;
  {
    const size_t tail = (size_t)(p->bufBase + p->blockSize - p->buffer);
    return (tail <= p->keepSizeAfter);
  }
}
210
/* Refill the window from the stream when the remaining lookahead
   does not exceed (keepSizeAfter). */
void MatchFinder_ReadIfRequired(CMatchFinder *p)
{
  if (GET_AVAIL_BYTES(p) <= p->keepSizeAfter)
    MatchFinder_ReadBlock(p);
}
216
217
218
/* Default tuning: binary-tree mode, 4-byte main hash, chain cut value 32. */
static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
{
  p->btMode = 1;
  p->cutValue = 32;
  p->numHashBytes = 4;
  p->numHashBytes_Min = 2;
  p->numHashOutBits = 0;
  p->bigHash = 0;
}
228
229 #define kCrcPoly 0xEDB88320
230
/* One-time construction: clear pointers and state, apply default settings,
   and build the CRC-32 table (polynomial 0xEDB88320) used by the hash macros. */
void MatchFinder_Construct(CMatchFinder *p)
{
  unsigned i;

  p->buffer = NULL;
  p->bufBase = NULL;
  p->directInput = 0;
  p->stream = NULL;
  p->hash = NULL;
  p->expectedDataSize = (UInt64)(Int64)-1;
  MatchFinder_SetDefaultSettings(p);

  for (i = 0; i < 256; i++)
  {
    unsigned j;
    UInt32 r = (UInt32)i;
    for (j = 0; j < 8; j++)
      r = (r >> 1) ^ ((r & 1) ? kCrcPoly : 0);
    p->crc[i] = r;
  }
}
251
252 #undef kCrcPoly
253
/* Free the combined hash/son reference table and clear the pointer. */
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
{
  ISzAlloc_Free(alloc, p->hash);
  p->hash = NULL;
}
259
/* Free all match-finder resources: the hash/son table and the window buffer. */
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  LzInWindow_Free(p, alloc);
  MatchFinder_FreeThisClassMemory(p, alloc);
}
265
AllocRefs(size_t num,ISzAllocPtr alloc)266 static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
267 {
268 const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
269 if (sizeInBytes / sizeof(CLzRef) != num)
270 return NULL;
271 return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
272 }
273
274 #if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
275 #error Stop_Compiling_Bad_Reserve
276 #endif
277
278
279
static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
{
  /* Compute the window buffer size: (keepSizeBefore + keepSizeAfter) plus a
     reserve (half or a quarter of the base size) that reduces the number of
     MoveBlock() calls, rounded to kBlockSizeAlign.
     Returns 0 on 32-bit overflow or when the settings would be too slow. */
  UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
  /*
  if (historySize > kMaxHistorySize)
    return 0;
  */
  // printf("\nhistorySize == 0x%x\n", historySize);

  if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore) // if 32-bit overflow
    return 0;

  {
    const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
    const UInt32 rem = kBlockSizeMax - blockSize; // headroom left below the maximum
    const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
        + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
    if (blockSize >= kBlockSizeMax
        || rem < kBlockSizeReserveMin) // we reject settings that will be slow
      return 0;
    if (reserve >= rem)
      blockSize = kBlockSizeMax;
    else
    {
      blockSize += reserve;
      blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
    }
  }
  // printf("\n LzFind_blockSize = %x\n", blockSize);
  // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
  return blockSize;
}
312
313
314 // input is historySize
// input is historySize
/* Returns the main hash-table mask (hash_size - 1): the smallest (2^k - 1)
   covering (hs), with bounds depending on (numHashBytes). */
static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1; // 2-byte hash uses the full 16-bit value directly
  if (hs != 0)
    hs--;
  // round up to (2^k - 1) by propagating the highest set bit downwards
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1; // 3-byte hash values never exceed 24 bits
    /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size with recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
339
340 // input is historySize
// input is historySize
/* Like MatchFinder_GetHashMask2(), but the table size is reduced:
   halved, and halved again for sizes >= (1 << 24) when (numHashBytes != 3).
   Used when no explicit (numHashOutBits) was requested. */
static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1; // 2-byte hash uses the full 16-bit value directly
  if (hs != 0)
    hs--;
  // round up to (2^k - 1) by propagating the highest set bit downwards
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  hs >>= 1; // reduce: use half of the rounded history size
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1; // 3-byte hash values never exceed 24 bits
    else
      hs >>= 1; // reduce further for very large tables
    /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size with recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
368
369
/* Allocate the input window and the hash/son reference tables for the given
   dictionary size. Returns 1 on success; on failure frees all resources
   and returns 0. */
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
    UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
    ISzAllocPtr alloc)
{
  /* we need one additional byte in (p->keepSizeBefore),
     since we use MoveBlock() after (p->pos++) and before dictionary using */
  // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;

  keepAddBufferAfter += matchMaxLen;
  /* we need (p->keepSizeAfter >= p->numHashBytes) */
  if (keepAddBufferAfter < p->numHashBytes)
    keepAddBufferAfter = p->numHashBytes;
  // keepAddBufferAfter -= 2; // for debug
  p->keepSizeAfter = keepAddBufferAfter;

  if (p->directInput)
    p->blockSize = 0; // direct input: no window buffer is allocated
  if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
  {
    size_t hashSizeSum;
    {
      UInt32 hs;    // mask for the allocated main hash table
      UInt32 hsCur; // mask used now (can be smaller when expectedDataSize < historySize)

      if (p->numHashOutBits != 0)
      {
        // an explicit number of hash output bits was requested
        unsigned numBits = p->numHashOutBits;
        const unsigned nbMax =
            (p->numHashBytes == 2 ? 16 :
            (p->numHashBytes == 3 ? 24 : 32));
        if (numBits > nbMax)
          numBits = nbMax;
        if (numBits >= 32)
          hs = (UInt32)0 - 1;
        else
          hs = ((UInt32)1 << numBits) - 1;
        // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
        hs |= (1 << 16) - 1; /* don't change it! */
        if (p->numHashBytes >= 5)
          hs |= (256 << kLzHash_CrcShift_2) - 1;
        {
          // never exceed the mask implied by the history size
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
          if (hs > hs2)
            hs = hs2;
        }
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          // small input expected: use a smaller effective mask
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs2)
            hsCur = hs2;
        }
      }
      else
      {
        hs = MatchFinder_GetHashMask(p, historySize);
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs) // is it possible?
            hsCur = hs;
        }
      }

      p->hashMask = hsCur;

      hashSizeSum = hs;
      hashSizeSum++;
      if (hashSizeSum < hs) // overflow check (hs == 0xFFFFFFFF on 32-bit size_t)
        return 0;
      {
        // small fixed tables (hash2/hash3) precede the main hash table
        UInt32 fixedHashSize = 0;
        if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
        if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
        // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
        hashSizeSum += fixedHashSize;
        p->fixedHashSize = fixedHashSize;
      }
    }

    p->matchMaxLen = matchMaxLen;

    {
      size_t newSize;
      size_t numSons;
      const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
      p->historySize = historySize;
      p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)

      numSons = newCyclicBufferSize;
      if (p->btMode)
        numSons <<= 1; // binary-tree mode keeps two links per position
      newSize = hashSizeSum + numSons;

      if (numSons < newCyclicBufferSize || newSize < numSons) // overflow checks
        return 0;

      // aligned size is not required here, but it can be better for some loops
      #define NUM_REFS_ALIGN_MASK 0xF
      newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;

      // 22.02: we don't reallocate buffer, if old size is enough
      if (p->hash && p->numRefs >= newSize)
        return 1;

      MatchFinder_FreeThisClassMemory(p, alloc);
      p->numRefs = newSize;
      p->hash = AllocRefs(newSize, alloc);

      if (p->hash)
      {
        p->son = p->hash + hashSizeSum; // son[] follows the hash tables
        return 1;
      }
    }
  }

  MatchFinder_Free(p, alloc);
  return 0;
}
492
493
static void MatchFinder_SetLimits(CMatchFinder *p)
{
  /* Compute (p->lenLimit) and (p->posLimit).
     posLimit is the position where MatchFinder_CheckLimits() must run again
     (for normalization, cyclic-buffer wrap, or buffer refill);
     lenLimit is the maximal match length usable at the current position. */
  UInt32 k;
  UInt32 n = kMaxValForNormalize - p->pos;
  if (n == 0)
    n = (UInt32)(Int32)-1; // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)

  // limit by the number of steps before cyclicBufferPos wraps
  k = p->cyclicBufferSize - p->cyclicBufferPos;
  if (k < n)
    n = k;

  k = GET_AVAIL_BYTES(p);
  {
    const UInt32 ksa = p->keepSizeAfter;
    UInt32 mm = p->matchMaxLen;
    if (k > ksa)
      k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
    else if (k >= mm)
    {
      // the limitation for (p->lenLimit) update
      k -= mm;   // optimization : to reduce the number of checks
      k++;
      // k = 1; // non-optimized version : for debug
    }
    else
    {
      // near the end of data: only the remaining bytes can match
      mm = k;
      if (k != 0)
        k = 1;
    }
    p->lenLimit = mm;
  }
  if (k < n)
    n = k;

  p->posLimit = p->pos + n;
}
531
532
/* Reset the fixed (low) hash tables (hash2/hash3 heads) to the empty marker. */
void MatchFinder_Init_LowHash(CMatchFinder *p)
{
  CLzRef *item = p->hash;
  const CLzRef *lim = item + p->fixedHashSize;
  for (; item != lim; item++)
    *item = kEmptyHashValue;
}
541
542
/* Reset the main (high) hash table, which follows the fixed tables,
   to the empty marker. */
void MatchFinder_Init_HighHash(CMatchFinder *p)
{
  CLzRef *item = p->hash + p->fixedHashSize;
  const CLzRef *lim = item + (size_t)p->hashMask + 1;
  for (; item != lim; item++)
    *item = kEmptyHashValue;
}
551
552
/* Reset buffer pointer, positions, and stream state. */
void MatchFinder_Init_4(CMatchFinder *p)
{
  if (!p->directInput)
    p->buffer = p->bufBase;
  /* kEmptyHashValue = 0 (Zero) is the NO-VALUE marker in the hash tables,
     so positions start from 1 (smallest optimal value — do not change it);
     the code in CMatchFinderMt also expects (pos == 1). */
  p->pos = 1;
  p->streamPos = 1;
  p->result = SZ_OK;
  p->streamEndWasReached = 0;
}
568
569
570 // (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
571 #define CYC_TO_POS_OFFSET 0
572 // #define CYC_TO_POS_OFFSET 1 // for debug
573
/* Full (re)initialization: clear the hash tables, reset positions and
   stream state, read the first block of input, and set the pos/len limits. */
void MatchFinder_Init(CMatchFinder *p)
{
  MatchFinder_Init_HighHash(p);
  MatchFinder_Init_LowHash(p);
  MatchFinder_Init_4(p);
  // if (readData)
  MatchFinder_ReadBlock(p);

  /* if we init (cyclicBufferPos = pos), then we can use one variable
     instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
  p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
  // p->cyclicBufferPos = 0; // smallest value
  // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
  MatchFinder_SetLimits(p);
}
589
590
591
592 #ifdef MY_CPU_X86_OR_AMD64
593 #if defined(__clang__) && (__clang_major__ >= 4) \
594 || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
595 // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
596
597 #define USE_LZFIND_SATUR_SUB_128
598 #define USE_LZFIND_SATUR_SUB_256
599 #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
600 #define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2")))
601 #elif defined(_MSC_VER)
602 #if (_MSC_VER >= 1600)
603 #define USE_LZFIND_SATUR_SUB_128
604 #endif
605 #if (_MSC_VER >= 1900)
606 #define USE_LZFIND_SATUR_SUB_256
607 #endif
608 #endif
609
610 // #elif defined(MY_CPU_ARM_OR_ARM64)
611 #elif defined(MY_CPU_ARM64)
612
613 #if defined(__clang__) && (__clang_major__ >= 8) \
614 || defined(__GNUC__) && (__GNUC__ >= 8)
615 #define USE_LZFIND_SATUR_SUB_128
616 #ifdef MY_CPU_ARM64
617 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
618 #else
619 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
620 #endif
621
622 #elif defined(_MSC_VER)
623 #if (_MSC_VER >= 1910)
624 #define USE_LZFIND_SATUR_SUB_128
625 #endif
626 #endif
627
628 #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
629 #include <arm64_neon.h>
630 #else
631 #include <arm_neon.h>
632 #endif
633
634 #endif
635
636
637 #ifdef USE_LZFIND_SATUR_SUB_128
638
639 // #define Z7_SHOW_HW_STATUS
640
641 #ifdef Z7_SHOW_HW_STATUS
642 #include <stdio.h>
643 #define PRF(x) x
644 PRF(;)
645 #else
646 #define PRF(x)
647 #endif
648
649
650 #ifdef MY_CPU_ARM_OR_ARM64
651
652 #ifdef MY_CPU_ARM64
653 // #define FORCE_LZFIND_SATUR_SUB_128
654 #endif
655 typedef uint32x4_t LzFind_v128;
656 #define SASUB_128_V(v, s) \
657 vsubq_u32(vmaxq_u32(v, s), s)
658
659 #else // MY_CPU_ARM_OR_ARM64
660
661 #include <smmintrin.h> // sse4.1
662
663 typedef __m128i LzFind_v128;
664 // SSE 4.1
665 #define SASUB_128_V(v, s) \
666 _mm_sub_epi32(_mm_max_epu32(v, s), s)
667
668 #endif // MY_CPU_ARM_OR_ARM64
669
670
671 #define SASUB_128(i) \
672 *( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \
673 *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
674
675
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_SSE41
LZFIND_ATTRIB_SSE41
#endif
void
Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  /* Saturating subtract of (subValue) from every 32-bit item in [items, lim)
     using 128-bit SIMD (SSE4.1 _mm_max_epu32 or NEON vmaxq_u32): 16 items
     per loop iteration. The caller (MatchFinder_Normalize3) passes a range
     that is 128-byte aligned with a length that is a multiple of 32 items. */
  const LzFind_v128 sub2 =
  #ifdef MY_CPU_ARM_OR_ARM64
      vdupq_n_u32(subValue);
  #else
      _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
  #endif
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_128(0) SASUB_128(1) items += 2 * 4;
    SASUB_128(0) SASUB_128(1) items += 2 * 4;
  }
  while (items != lim);
}
699
700
701
702 #ifdef USE_LZFIND_SATUR_SUB_256
703
704 #include <immintrin.h> // avx
705 /*
706 clang :immintrin.h uses
707 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
708 defined(__AVX2__)
709 #include <avx2intrin.h>
710 #endif
711 so we need <avxintrin.h> for clang-cl */
712
713 #if defined(__clang__)
714 #include <avxintrin.h>
715 #include <avx2intrin.h>
716 #endif
717
718 // AVX2:
719 #define SASUB_256(i) \
720 *( __m256i *)( void *)(items + (i) * 8) = \
721 _mm256_sub_epi32(_mm256_max_epu32( \
722 *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
723
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_AVX2
LZFIND_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  /* AVX2 variant of the saturating subtract: 32 items per loop iteration
     (_mm256_max_epu32 / _mm256_sub_epi32). Same caller contract as
     LzFind_SaturSub_128: aligned range, length a multiple of 32 items. */
  const __m256i sub2 = _mm256_set_epi32(
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_256(0) SASUB_256(1) items += 2 * 8;
    SASUB_256(0) SASUB_256(1) items += 2 * 8;
  }
  while (items != lim);
}
744 #endif // USE_LZFIND_SATUR_SUB_256
745
746 #ifndef FORCE_LZFIND_SATUR_SUB_128
747 typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
748 UInt32 subValue, CLzRef *items, const CLzRef *lim);
749 static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
750 #endif // FORCE_LZFIND_SATUR_SUB_128
751
752 #endif // USE_LZFIND_SATUR_SUB_128
753
754
755 // kEmptyHashValue must be zero
756 // #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; }
757 #define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; }
758
759 #ifdef FORCE_LZFIND_SATUR_SUB_128
760
761 #define DEFAULT_SaturSub LzFind_SaturSub_128
762
763 #else
764
765 #define DEFAULT_SaturSub LzFind_SaturSub_32
766
Z7_NO_INLINE
static
void
Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  /* Scalar fallback for the saturating subtract: 8 items per unrolled
     loop iteration. The caller passes a range whose length is a multiple
     of 32 items, so the unroll factor always divides it evenly. */
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_32(0) SASUB_32(1) items += 2;
    SASUB_32(0) SASUB_32(1) items += 2;
    SASUB_32(0) SASUB_32(1) items += 2;
    SASUB_32(0) SASUB_32(1) items += 2;
  }
  while (items != lim);
}
783
784 #endif
785
786
Z7_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
{
  /* Subtract (subValue) from every item, saturating at 0 (kEmptyHashValue),
     so stale references become "empty" instead of wrapping.
     The scalar head loop brings (items) to 128-byte alignment, the middle
     part (a multiple of 32 items) runs through the fast SIMD/unrolled
     routine, and the scalar tail loop finishes the remainder. */
  #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
  {
    const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
    CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
    numItems &= k_Align_Mask; // items left over for the tail loop
    if (items != lim)
    {
      #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
      if (g_LzFind_SaturSub)
        g_LzFind_SaturSub(subValue, items, lim); // runtime-selected SIMD routine
      else
      #endif
        DEFAULT_SaturSub(subValue, items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  for (; numItems != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
}
819
820
821
822 // call MatchFinder_CheckLimits() only after (p->pos++) update
823
Z7_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  /* Called when (p->pos) reaches (p->posLimit): refills the window buffer
     if needed, normalizes all stored positions when (pos) hits
     kMaxValForNormalize, wraps the cyclic buffer position, and recomputes
     the limits. Call only after the (p->pos++) update. */
  if (// !p->streamEndWasReached && p->result == SZ_OK &&
      p->keepSizeAfter == GET_AVAIL_BYTES(p))
  {
    // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
    if (MatchFinder_NeedMove(p))
      MatchFinder_MoveBlock(p);
    MatchFinder_ReadBlock(p);
  }

  if (p->pos == kMaxValForNormalize)
  if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
    /*
       if we disable normalization for last bytes of data, and
       if (data_size == 4 GiB), we don't call wasteful normalization,
       but (pos) will be wrapped over Zero (0) in that case.
       And we cannot resume later to normal operation
    */
  {
    // MatchFinder_Normalize(p);
    /* after normalization we need (p->pos >= p->historySize + 1); */
    /* we can reduce subValue to aligned value, if want to keep alignment
       of (p->pos) and (p->buffer) for speculated accesses. */
    const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
    // const UInt32 subValue = (1 << 15); // for debug
    // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
    MatchFinder_REDUCE_OFFSETS(p, subValue)
    // normalize all hash heads, then all son[] links
    MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
    {
      size_t numSonRefs = p->cyclicBufferSize;
      if (p->btMode)
        numSonRefs <<= 1;
      MatchFinder_Normalize3(subValue, p->son, numSonRefs);
    }
  }

  if (p->cyclicBufferPos == p->cyclicBufferSize)
    p->cyclicBufferPos = 0; // wrap the cyclic buffer index

  MatchFinder_SetLimits(p);
}
867
868
/*
  (lenLimit > maxLen)
*/
Z7_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, unsigned maxLen)
{
  /* Hash-chain (hc) match search. Walks the chain starting at (curMatch),
     writing (len, dist - 1) pairs with strictly increasing (len) to (d).
     Returns the updated (d) pointer. The walk stops after (cutValue) steps,
     when the chain leaves the dictionary window (delta >= cyclicBufferSize),
     hits the end marker (curMatch == 0), or a match of (lenLimit) bytes
     is found. */
  /* reference (non-optimized) version:
  son[_cyclicBufferPos] = curMatch;
  for (;;)
  {
    UInt32 delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
      return d;
    {
      const Byte *pb = cur - delta;
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        UInt32 len = 0;
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        if (maxLen < len)
        {
          maxLen = len;
          *d++ = len;
          *d++ = delta - 1;
          if (len == lenLimit)
            return d;
        }
      }
    }
  }
  */

  const Byte *lim = cur + lenLimit;
  son[_cyclicBufferPos] = curMatch; // record the new chain head for this position

  do
  {
    UInt32 delta;

    if (curMatch == 0)
      break; // kEmptyHashValue : end of chain
    // if (curMatch2 >= curMatch) return NULL;
    delta = pos - curMatch;
    if (delta >= _cyclicBufferSize)
      break; // candidate fell out of the dictionary window
    {
      ptrdiff_t diff;
      // follow the chain via the cyclic index of the candidate position
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
      // quick filter: a longer match must at least agree at offset (maxLen)
      if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
      {
        const Byte *c = cur;
        while (*c == c[diff])
        {
          if (++c == lim)
          {
            // full-length match : record it and stop searching
            d[0] = (UInt32)(lim - cur);
            d[1] = delta - 1;
            return d + 2;
          }
        }
        {
          const unsigned len = (unsigned)(c - cur);
          if (maxLen < len)
          {
            maxLen = len;
            d[0] = (UInt32)len;
            d[1] = delta - 1;
            d += 2;
          }
        }
      }
    }
  }
  while (--cutValue);

  return d;
}
952
953
Z7_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, UInt32 maxLen)
{
  /* Binary-tree (bt) match search. Finds matches for the current position
     and updates the pair of links in son[] for this position at the same
     time. Writes (len, dist - 1) pairs with strictly increasing (len) to
     (d) and returns the updated (d) pointer. Stops after (cutValue) steps,
     when the tree walk leaves the dictionary window, or when a match of
     (lenLimit) bytes is found. */
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; // one child link of the node for (pos)
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);     // the other child link
  unsigned len0 = 0, len1 = 0; // prefix lengths already proven along each branch

  UInt32 cmCheck;

  // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }

  // smallest position that is still inside the dictionary window
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1); // bytes already known equal
      const UInt32 pair0 = pair[0];
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          maxLen = (UInt32)len;
          *d++ = (UInt32)len;
          *d++ = delta - 1;
          if (len == lenLimit)
          {
            // full-length match : splice the node out and finish
            *ptr1 = pair0;
            *ptr0 = pair[1];
            return d;
          }
        }
      }
      // descend into the branch chosen by the first differing byte
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        // const UInt32 curMatch2 = pair[1];
        // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }
        // curMatch = curMatch2;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue; // terminate both branches
  return d;
}
1023
1024
static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
{
  /* Same binary-tree update as GetMatchesSpec1(), but found matches are
     not reported — used when skipping positions. */
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1; // one child link of the node for (pos)
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);     // the other child link
  unsigned len0 = 0, len1 = 0; // prefix lengths already proven along each branch

  UInt32 cmCheck;

  // smallest position that is still inside the dictionary window
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (// curMatch >= pos || // failure
      cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1); // bytes already known equal
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            // full-length match : splice the node out and finish
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      // descend into the branch chosen by the first differing byte
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  *ptr0 = *ptr1 = kEmptyHashValue; // terminate both branches
  return;
}
1082
1083
1084 #define MOVE_POS \
1085 ++p->cyclicBufferPos; \
1086 p->buffer++; \
1087 { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }
1088
1089 #define MOVE_POS_RET MOVE_POS return distances;
1090
Z7_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
{
  /* Advance one position without updating the match structures.
     we go here at the end of stream data, when (avail < num_hash_bytes)
     We don't update sons[cyclicBufferPos << btMode].
     So (sons) record will contain junk. And we cannot resume match searching
     to normal operation, even if we will provide more input data in buffer.
     p->sons[p->cyclicBufferPos << p->btMode] = 0;  // kEmptyHashValue
     if (p->btMode)
        p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0;  // kEmptyHashValue
  */
  MOVE_POS
}
1104
// Common prologue for GetMatches/Skip functions: declares
// (lenLimit, hv, cur, curMatch); if fewer than (minLen) bytes are
// available at the current position, advances one byte and executes
// (ret_op) instead of searching.
#define GET_MATCHES_HEADER2(minLen, ret_op) \
  unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \
  lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
  cur = p->buffer;

#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
// SKIP_HEADER opens a `do { ... } while (--num)` loop that SKIP_FOOTER closes.
#define SKIP_HEADER(minLen)   do { GET_MATCHES_HEADER2(minLen, continue)

// Shared argument list for GetMatchesSpec1 / SkipMatchesSpec.
#define MF_PARAMS(p)  lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

#define SKIP_FOOTER  SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num);

// Run the tree/chain search (func), advance one byte, return the updated
// distances pointer.
#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
  distances = func(MF_PARAMS(p), \
  distances, (UInt32)_maxLen_); MOVE_POS_RET

#define GET_MATCHES_FOOTER_BT(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)

#define GET_MATCHES_FOOTER_HC(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)



// Extend (maxLen) while the bytes at back-distance (d2) keep matching the
// current bytes, up to (lenLimit).
#define UPDATE_maxLen { \
    const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
    const Byte *c = cur + maxLen; \
    const Byte *lim = cur + lenLimit; \
    for (; c != lim; c++) if (*(c + diff) != *c) break; \
    maxLen = (unsigned)(c - cur); }
1135
// BT2: binary-tree match finder with a direct 2-byte hash (HASH2_CALC).
// Writes (length, distance - 1) pairs into (distances); returns the pointer
// past the last pair written (via GET_MATCHES_FOOTER_BT / MOVE_POS_RET).
static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(2)
  HASH2_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(1)
}
1144
// BT3 (zip variant): binary-tree match finder using HASH_ZIP_CALC and a
// single hash table; minimum match length 3.
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(2)
}
1153
1154
// mmm = maximum usable back-distance at the current position:
// the cyclic-buffer size, clipped by (pos) near the start of the stream.
#define SET_mmm  \
    mmm = p->cyclicBufferSize; \
    if (pos < mmm) \
      mmm = pos;
1159
1160
// BT3: binary-tree match finder with 2-byte (h2) and 3-byte (hv) hash
// tables.  Reports a length-2+ candidate from the 2-byte hash first, then
// runs the tree search for length >= 3.
static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, d2, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(3)

  HASH3_CALC

  hash = p->hash;
  pos = p->pos;

  // Distance to the previous position that had the same 2-byte hash.
  d2 = pos - hash[h2];

  curMatch = (hash + kFix3HashSize)[hv];

  hash[h2] = pos;
  (hash + kFix3HashSize)[hv] = pos;

  SET_mmm

  maxLen = 2;

  // Per the HASH3_CALC note: equal h2 plus equal cur[0] implies cur[1] also
  // matches, so one byte check confirms a length-2 match.
  if (d2 < mmm && *(cur - d2) == *cur)
  {
    UPDATE_maxLen
    distances[0] = (UInt32)maxLen;
    distances[1] = d2 - 1;
    distances += 2;
    if (maxLen == lenLimit)
    {
      // Already at the length cap: only re-link the tree and return.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1200
1201
// BT4: binary-tree match finder with 2-, 3- and 4-byte hash tables.
// Short (len 2/3) candidates come from the h2/h3 tables; the tree search
// then finds matches of length >= maxLen + 1.
static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the latest positions with the same 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  // Note: this for(;;) body executes at most once — every path ends in
  // break or return; the loop form just provides a shared exit point.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      // Length-2 match from the h2 table (second byte implied by hash).
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // d2 candidate extends to length 3+; extended below.
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        // Otherwise try the length-3 candidate from the h3 table.
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    // Extend the surviving candidate and patch its recorded length.
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1268
1269
// BT5: binary-tree match finder with 2-, 3- and 5-byte hash tables
// (kFix5HashSize aliases kFix4HashSize; the 4-byte table is disabled —
// see the commented-out h4 lines).
static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the latest positions with the same 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  // The for(;;) body executes at most once; loop form provides shared exit.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // d2 candidate reaches length 3; extended below.
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    // Candidate confirmed to length 3; stop here unless byte 3 matches too.
    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1340
1341
// HC4: hash-chain match finder with 2-, 3- and 4-byte hash tables.
// Same short-match logic as Bt4, but (son) holds a singly linked chain
// instead of a tree, searched by Hc_GetMatchesSpec.
static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the latest positions with the same 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;

  // The for(;;) body executes at most once; loop form provides shared exit.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // d2 candidate extends to length 3+; extended below.
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // Length cap reached: just insert this position into the chain.
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1408
1409
// HC5: hash-chain match finder with 2-, 3- and 5-byte hash tables
// (4-byte table disabled, kFix5HashSize == kFix4HashSize).
static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  // Back-distances of the latest positions with the same 2-/3-byte hashes.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;

  // The for(;;) body executes at most once; loop form provides shared exit.
  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // d2 candidate reaches length 3; extended below.
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    // Candidate confirmed to length 3; stop here unless byte 3 matches too.
    distances[-2] = 3;
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = maxLen;
    if (maxLen == lenLimit)
    {
      // Length cap reached: just insert this position into the chain.
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1480
1481
// HC3 (zip variant): hash-chain match finder using HASH_ZIP_CALC with a
// single hash table; minimum match length 3.
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_HC(2)
}
1490
1491
// Skip (num) positions in BT2 mode: update the hash table and re-link the
// tree for each position without collecting matches.
static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(2)
  {
    HASH2_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1502
// Skip (num) positions in BT3-zip mode (hash update + tree re-link only).
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    HASH_ZIP_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1513
// Skip (num) positions in BT3 mode: refresh the 2- and 3-byte hash tables
// and re-link the tree for each position.
static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    UInt32 h2;
    UInt32 *hash;
    HASH3_CALC
    hash = p->hash;
    curMatch = (hash + kFix3HashSize)[hv];
    hash[h2] =
    (hash + kFix3HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1528
// Skip (num) positions in BT4 mode: refresh the 2-/3-/4-byte hash tables
// and re-link the tree for each position.
static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(4)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH4_CALC
    hash = p->hash;
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1544
// Skip (num) positions in BT5 mode: refresh the 2-/3-/5-byte hash tables
// (4-byte table disabled) and re-link the tree for each position.
static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(5)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH5_CALC
    hash = p->hash;
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1561
1562
// Batched skip for hash-chain mode: processes up to (num) positions,
// updating only the hash tables and the chain links in (son).  The inner
// loop handles (num2) positions bounded by posLimit, so the per-position
// limit check is hoisted out of the loop.  Closed by HC_SKIP_FOOTER.
#define HC_SKIP_HEADER(minLen) \
    do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
    const Byte *cur; \
    UInt32 *hash; \
    UInt32 *son; \
    UInt32 pos = p->pos; \
    UInt32 num2 = num; \
    /* (p->pos == p->posLimit) is not allowed here !!! */ \
    { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
    num -= num2; \
    { const UInt32 cycPos = p->cyclicBufferPos; \
      son = p->son + cycPos; \
      p->cyclicBufferPos = cycPos + num2; } \
    cur = p->buffer; \
    hash = p->hash; \
    do { \
      UInt32 curMatch; \
      UInt32 hv;


// Epilogue for HC_SKIP_HEADER: stores the chain link, advances, writes back
// the batched pointers/counters, and re-checks the window limits.
#define HC_SKIP_FOOTER \
      cur++;  pos++;  *son++ = curMatch; \
    } while (--num2); \
    p->buffer = cur; \
    p->pos = pos; \
    if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
    }} while(num); \
1590
1591
// Skip (num) positions in HC4 mode: refresh the 2-/3-/4-byte hash tables
// and append each position to its hash chain.
static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(4)

    UInt32 h2, h3;
    HASH4_CALC
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = pos;
  
  HC_SKIP_FOOTER
}
1605
1606
// Skip (num) positions in HC5 mode: refresh the 2-/3-/5-byte hash tables
// (4-byte table disabled) and append each position to its hash chain.
static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(5)

    UInt32 h2, h3;
    HASH5_CALC
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = pos;
  
  HC_SKIP_FOOTER
}
1621
1622
// Skip (num) positions in HC3-zip mode (single hash table).
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(3)

    HASH_ZIP_CALC
    curMatch = hash[hv];
    hash[hv] = pos;
  
  HC_SKIP_FOOTER
}
1633
1634
// Fill the IMatchFinder2 function table for the configured mode:
// hash-chain (btMode == 0) with 4 or 5 hash bytes, or binary-tree with
// 2..5 hash bytes (numHashBytes > 4 selects the 5-byte variant).
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
{
  vTable->Init = (Mf_Init_Func)MatchFinder_Init;
  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;

  if (!p->btMode)
  {
    // hash-chain mode: only the 4- and 5-byte variants exist
    if (p->numHashBytes <= 4)
    {
      vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
    }
    else
    {
      vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip;
    }
    return;
  }

  // binary-tree mode: dispatch on the number of hash bytes
  switch (p->numHashBytes)
  {
    case 2:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
      break;
    case 3:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
      break;
    case 4:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
      break;
    default:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip;
      break;
  }
}
1674
1675
1676
// One-time runtime CPU-feature detection: selects the widest available
// vectorized saturated-subtract routine (NEON / SSE4.1 / AVX2) and stores
// it in g_LzFind_SaturSub.  No-op when the build forces or disables the
// 128-bit path at compile time.
void LzFindPrepare(void)
{
  #ifndef FORCE_LZFIND_SATUR_SUB_128
  #ifdef USE_LZFIND_SATUR_SUB_128
  LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
  #ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== LzFind NEON")
      PRF(printf("\n=== LzFind NEON\n"));
      f = LzFind_SaturSub_128;
    }
    // f = 0; // for debug
  }
  #else // MY_CPU_ARM_OR_ARM64
  if (CPU_IsSupported_SSE41())
  {
    // #pragma message ("=== LzFind SSE41")
    PRF(printf("\n=== LzFind SSE41\n"));
    f = LzFind_SaturSub_128;

    #ifdef USE_LZFIND_SATUR_SUB_256
    // AVX2 supersedes the SSE4.1 path when available.
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== LzFind AVX2")
      PRF(printf("\n=== LzFind AVX2\n"));
      f = LzFind_SaturSub_256;
    }
    #endif
  }
  #endif // MY_CPU_ARM_OR_ARM64
  g_LzFind_SaturSub = f;
  #endif // USE_LZFIND_SATUR_SUB_128
  #endif // FORCE_LZFIND_SATUR_SUB_128
}
1713
1714
// Undefine file-local helper macros so they cannot leak into code that
// includes or concatenates this translation unit.
#undef MOVE_POS
#undef MOVE_POS_RET
#undef PRF
1718