/* (removed: HTML code-viewer navigation residue, not part of the source) */
1 /* LzFind.c -- Match finder for LZ algorithms
2 2023-03-14 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #include <string.h>
7 // #include <stdio.h>
8 
9 #include "CpuArch.h"
10 #include "LzFind.h"
11 #include "LzHash.h"
12 
13 #define kBlockMoveAlign       (1 << 7)    // alignment for memmove()
14 #define kBlockSizeAlign       (1 << 16)   // alignment for block allocation
15 #define kBlockSizeReserveMin  (1 << 24)   // it's 1/256 from 4 GB dictionary
16 
17 #define kEmptyHashValue 0
18 
19 #define kMaxValForNormalize ((UInt32)0)
20 // #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
21 
22 // #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
23 
24 #define GET_AVAIL_BYTES(p) \
25   Inline_MatchFinder_GetNumAvailableBytes(p)
26 
27 
28 // #define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size)
29 #define kFix5HashSize kFix4HashSize
30 
31 /*
32  HASH2_CALC:
33    if (hv) match, then cur[0] and cur[1] also match
34 */
35 #define HASH2_CALC hv = GetUi16(cur);
36 
37 // (crc[0 ... 255] & 0xFF) provides one-to-one correspondence to [0 ... 255]
38 
39 /*
40  HASH3_CALC:
41    if (cur[0]) and (h2) match, then cur[1]            also match
42    if (cur[0]) and (hv) match, then cur[1] and cur[2] also match
43 */
44 #define HASH3_CALC { \
45   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
46   h2 = temp & (kHash2Size - 1); \
47   hv = (temp ^ ((UInt32)cur[2] << 8)) & p->hashMask; }
48 
49 #define HASH4_CALC { \
50   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
51   h2 = temp & (kHash2Size - 1); \
52   temp ^= ((UInt32)cur[2] << 8); \
53   h3 = temp & (kHash3Size - 1); \
54   hv = (temp ^ (p->crc[cur[3]] << kLzHash_CrcShift_1)) & p->hashMask; }
55 
56 #define HASH5_CALC { \
57   UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
58   h2 = temp & (kHash2Size - 1); \
59   temp ^= ((UInt32)cur[2] << 8); \
60   h3 = temp & (kHash3Size - 1); \
61   temp ^= (p->crc[cur[3]] << kLzHash_CrcShift_1); \
62   /* h4 = temp & p->hash4Mask; */ /* (kHash4Size - 1); */ \
63   hv = (temp ^ (p->crc[cur[4]] << kLzHash_CrcShift_2)) & p->hashMask; }
64 
65 #define HASH_ZIP_CALC hv = ((cur[2] | ((UInt32)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF;
66 
67 
/* Releases the input-window buffer (if allocated) and clears the pointer. */
static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  Byte *buf = p->bufBase;
  p->bufBase = NULL;
  ISzAlloc_Free(alloc, buf);
}
76 
77 
/* Ensures an input-window buffer of exactly (blockSize) bytes exists.
   An already-allocated buffer of the same size is reused.
   Returns nonzero on success, 0 on failure (blockSize == 0 or alloc failure).
   Note: blockSize is UInt32, because (p->streamPos - p->pos) arithmetic
   elsewhere is 32-bit and must not wrap over 4 GiB. */
static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr alloc)
{
  if (blockSize == 0)
    return 0;
  if (p->bufBase && p->blockSize == blockSize)
    return 1;  // existing buffer already has the requested size
  LzInWindow_Free(p, alloc);
  p->blockSize = blockSize;
  p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
  return (p->bufBase != NULL);
}
110 
MatchFinder_GetPointerToCurrentPos(CMatchFinder * p)111 static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }
112 
MatchFinder_GetNumAvailableBytes(CMatchFinder * p)113 static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }
114 
115 
/* Fills the look-ahead part of the window from the input stream until
   more than (keepSizeAfter) bytes are available, the stream ends, or an
   error occurs. In directInput mode no copying happens; only the counters
   are advanced. */
Z7_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  if (p->streamEndWasReached || p->result != SZ_OK)
    return;

  /* We use (p->streamPos - p->pos) value.
     (p->streamPos < p->pos) is allowed. */

  if (p->directInput)
  {
    // advance streamPos by as much of directInputRem as fits in UInt32 math
    UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
    if (curSize > p->directInputRem)
      curSize = (UInt32)p->directInputRem;
    p->streamPos += curSize;
    p->directInputRem -= curSize;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }

  for (;;)
  {
    // write position = end of currently available data
    const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
    size_t size = (size_t)(p->bufBase + p->blockSize - dest);
    if (size == 0)
    {
      /* we call ReadBlock() after NeedMove() and MoveBlock().
         NeedMove() and MoveBlock() provide more than (keepSizeAfter)
         to the end of (blockSize).
         So we don't execute this branch in normal code flow.
         We can go here, if we will call ReadBlock() before NeedMove(), MoveBlock().
      */
      // p->result = SZ_ERROR_FAIL; // we can show error here
      return;
    }

    // #define kRead 3
    // if (size > kRead) size = kRead; // for debug

    /*
    // we need cast (Byte *)dest.
    #ifdef __clang__
      #pragma GCC diagnostic ignored "-Wcast-qual"
    #endif
    */
    // (bufBase + (dest - bufBase)) yields a non-const pointer without a cast
    p->result = ISeqInStream_Read(p->stream,
        p->bufBase + (dest - p->bufBase), &size);
    if (p->result != SZ_OK)
      return;
    if (size == 0)
    {
      p->streamEndWasReached = 1;
      return;
    }
    p->streamPos += (UInt32)size;
    if (GET_AVAIL_BYTES(p) > p->keepSizeAfter)
      return;
    /* here and in another (p->keepSizeAfter) checks we keep on 1 byte more than was requested by Create() function
         (GET_AVAIL_BYTES(p) >= p->keepSizeAfter) - minimal required size */
  }

  // on exit: (p->result != SZ_OK || p->streamEndWasReached || GET_AVAIL_BYTES(p) > p->keepSizeAfter)
}
180 
181 
182 
/* Slides the window: moves the still-needed tail (the keepSizeBefore bytes
   of history plus the unread look-ahead bytes) toward the start of bufBase.
   The discarded prefix is rounded down to kBlockMoveAlign so the memmove()
   source stays aligned. */
Z7_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  // bytes in front of the required history that may be discarded
  const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
  // keep the sub-alignment remainder of (offset) in front of the history
  const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
  p->buffer = p->bufBase + keepBefore;
  memmove(p->bufBase,
      p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
      keepBefore + (size_t)GET_AVAIL_BYTES(p));
}
193 
194 /* We call MoveBlock() before ReadBlock().
195    So MoveBlock() can be wasteful operation, if the whole input data
196    can fit in current block even without calling MoveBlock().
197    in important case where (dataSize <= historySize)
198      condition (p->blockSize > dataSize + p->keepSizeAfter) is met
199      So there is no MoveBlock() in that case case.
200 */
201 
/* Returns nonzero when the space left after (p->buffer) no longer exceeds
   (keepSizeAfter), i.e. MatchFinder_MoveBlock() is required before the
   next read. Never true for directInput, a finished stream, or an error. */
int MatchFinder_NeedMove(CMatchFinder *p)
{
  size_t tail;
  if (p->directInput || p->streamEndWasReached || p->result != SZ_OK)
    return 0;
  tail = (size_t)(p->bufBase + p->blockSize - p->buffer);
  return tail <= p->keepSizeAfter;
}
210 
/* Reads more input only when the look-ahead has shrunk to keepSizeAfter
   bytes or fewer. */
void MatchFinder_ReadIfRequired(CMatchFinder *p)
{
  const UInt32 avail = GET_AVAIL_BYTES(p);
  if (avail <= p->keepSizeAfter)
    MatchFinder_ReadBlock(p);
}
216 
217 
218 
/* Default configuration: binary-tree mode with 4-byte hash (bt4),
   chain cut value 32, no forced hash size, normal (small) hash. */
static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
{
  p->btMode = 1;
  p->numHashBytes = 4;
  p->numHashBytes_Min = 2;
  p->numHashOutBits = 0;
  p->cutValue = 32;
  p->bigHash = 0;
}
228 
229 #define kCrcPoly 0xEDB88320
230 
/* One-time construction: zeroes all pointers, applies default settings
   and builds the reflected CRC-32 table (poly 0xEDB88320) used by the
   hash macros. */
void MatchFinder_Construct(CMatchFinder *p)
{
  unsigned i;

  p->buffer = NULL;
  p->bufBase = NULL;
  p->directInput = 0;
  p->stream = NULL;
  p->hash = NULL;
  p->expectedDataSize = (UInt64)(Int64)-1;
  MatchFinder_SetDefaultSettings(p);

  for (i = 0; i < 256; i++)
  {
    unsigned j;
    UInt32 r = (UInt32)i;
    for (j = 0; j < 8; j++)
      r = (r & 1) ? ((r >> 1) ^ kCrcPoly) : (r >> 1);
    p->crc[i] = r;
  }
}
251 
252 #undef kCrcPoly
253 
/* Releases the combined hash/son reference table. */
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
{
  CLzRef *refs = p->hash;
  p->hash = NULL;
  ISzAlloc_Free(alloc, refs);
}
259 
/* Full teardown: frees the reference tables and the input window. */
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
  MatchFinder_FreeThisClassMemory(p, alloc);
  LzInWindow_Free(p, alloc);
}
265 
AllocRefs(size_t num,ISzAllocPtr alloc)266 static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
267 {
268   const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
269   if (sizeInBytes / sizeof(CLzRef) != num)
270     return NULL;
271   return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
272 }
273 
274 #if (kBlockSizeReserveMin < kBlockSizeAlign * 2)
275   #error Stop_Compiling_Bad_Reserve
276 #endif
277 
278 
279 
/* Computes the input-window allocation size for (historySize):
   keepSizeBefore + keepSizeAfter plus a growth reserve, rounded to
   kBlockSizeAlign. Returns 0 when the sizes overflow 32 bits or the
   remaining headroom is too small (settings would be slow). */
static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
{
  UInt32 blockSize = (p->keepSizeBefore + p->keepSizeAfter);
  /*
  if (historySize > kMaxHistorySize)
    return 0;
  */
  // printf("\nhistorySize == 0x%x\n", historySize);

  if (p->keepSizeBefore < historySize || blockSize < p->keepSizeBefore)  // if 32-bit overflow
    return 0;

  {
    const UInt32 kBlockSizeMax = (UInt32)0 - (UInt32)kBlockSizeAlign;
    const UInt32 rem = kBlockSizeMax - blockSize;
    // reserve ~1/2 (or ~1/4 for big blocks) extra, plus fixed slack
    const UInt32 reserve = (blockSize >> (blockSize < ((UInt32)1 << 30) ? 1 : 2))
        + (1 << 12) + kBlockMoveAlign + kBlockSizeAlign; // do not overflow 32-bit here
    if (blockSize >= kBlockSizeMax
        || rem < kBlockSizeReserveMin) // we reject settings that will be slow
      return 0;
    if (reserve >= rem)
      blockSize = kBlockSizeMax;
    else
    {
      blockSize += reserve;
      blockSize &= ~(UInt32)(kBlockSizeAlign - 1);
    }
  }
  // printf("\n LzFind_blockSize = %x\n", blockSize);
  // printf("\n LzFind_blockSize = %d\n", blockSize >> 20);
  return blockSize;
}
312 
313 
314 // input is historySize
// input is historySize
/* Builds a hash mask ((2^k)-1 form) sized for (hs) history bytes:
   the next power-of-two minus 1, capped at 24 bits for 3-byte hash,
   with the low 16 bits always set, and a larger minimum for bt5/hc5. */
static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1;
  if (hs != 0)
    hs--;
  // smear the highest set bit downward: hs becomes (2^k - 1) >= input
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1;
    /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size with recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
339 
340 // input is historySize
// input is historySize
/* Like MatchFinder_GetHashMask2(), but produces a smaller table:
   the smeared mask is halved once (and once more above 24 bits when
   numHashBytes > 3), trading hash resolution for memory. */
static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
{
  if (p->numHashBytes == 2)
    return (1 << 16) - 1;
  if (hs != 0)
    hs--;
  // smear the highest set bit downward: hs becomes (2^k - 1) >= input
  hs |= (hs >> 1);
  hs |= (hs >> 2);
  hs |= (hs >> 4);
  hs |= (hs >> 8);
  // we propagated 16 bits in (hs). Low 16 bits must be set later
  hs >>= 1;  // halve: default table is half of the history-rounded size
  if (hs >= (1 << 24))
  {
    if (p->numHashBytes == 3)
      hs = (1 << 24) - 1;
    else
      hs >>= 1;
    /* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
  }
  // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
  hs |= (1 << 16) - 1; /* don't change it! */
  // bt5: we adjust the size with recommended minimum size
  if (p->numHashBytes >= 5)
    hs |= (256 << kLzHash_CrcShift_2) - 1;
  return hs;
}
368 
369 
/* Allocates the input window and the hash/son reference tables for the
   given (historySize) and buffering parameters.
   Returns 1 on success, 0 on failure (all memory is freed on failure).
   On success: p->hash points to [fixed hash tables][main hash][son refs],
   with p->son = p->hash + hash size. */
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
    UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
    ISzAllocPtr alloc)
{
  /* we need one additional byte in (p->keepSizeBefore),
     since we use MoveBlock() after (p->pos++) and before dictionary using */
  // keepAddBufferBefore = (UInt32)0xFFFFFFFF - (1 << 22); // for debug
  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;

  keepAddBufferAfter += matchMaxLen;
  /* we need (p->keepSizeAfter >= p->numHashBytes) */
  if (keepAddBufferAfter < p->numHashBytes)
    keepAddBufferAfter = p->numHashBytes;
  // keepAddBufferAfter -= 2; // for debug
  p->keepSizeAfter = keepAddBufferAfter;

  if (p->directInput)
    p->blockSize = 0;
  if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
  {
    size_t hashSizeSum;
    {
      UInt32 hs;     // allocation mask (table size - 1)
      UInt32 hsCur;  // active mask, possibly reduced for small expectedDataSize

      if (p->numHashOutBits != 0)
      {
        // explicit hash size requested: clamp to what numHashBytes can fill
        unsigned numBits = p->numHashOutBits;
        const unsigned nbMax =
            (p->numHashBytes == 2 ? 16 :
            (p->numHashBytes == 3 ? 24 : 32));
        if (numBits > nbMax)
          numBits = nbMax;
        if (numBits >= 32)
          hs = (UInt32)0 - 1;
        else
          hs = ((UInt32)1 << numBits) - 1;
        // (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
        hs |= (1 << 16) - 1; /* don't change it! */
        if (p->numHashBytes >= 5)
          hs |= (256 << kLzHash_CrcShift_2) - 1;
        {
          // never allocate more than the history-derived recommendation
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
          if (hs > hs2)
            hs = hs2;
        }
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          // small input: use a smaller active mask within the same table
          const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs2)
            hsCur = hs2;
        }
      }
      else
      {
        hs = MatchFinder_GetHashMask(p, historySize);
        hsCur = hs;
        if (p->expectedDataSize < historySize)
        {
          hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
          if (hsCur > hs) // is it possible?
            hsCur = hs;
        }
      }

      p->hashMask = hsCur;

      hashSizeSum = hs;
      hashSizeSum++;
      if (hashSizeSum < hs)  // size_t overflow guard
        return 0;
      {
        // fixed-size head tables for the shorter (2/3-byte) hashes
        UInt32 fixedHashSize = 0;
        if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
        if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
        // if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
        hashSizeSum += fixedHashSize;
        p->fixedHashSize = fixedHashSize;
      }
    }

    p->matchMaxLen = matchMaxLen;

    {
      size_t newSize;
      size_t numSons;
      const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
      p->historySize = historySize;
      p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)

      numSons = newCyclicBufferSize;
      if (p->btMode)
        numSons <<= 1;  // bt mode stores two links (left/right) per position
      newSize = hashSizeSum + numSons;

      if (numSons < newCyclicBufferSize || newSize < numSons)  // overflow guards
        return 0;

      // aligned size is not required here, but it can be better for some loops
      #define NUM_REFS_ALIGN_MASK 0xF
      newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;

      // 22.02: we don't reallocate buffer, if old size is enough
      if (p->hash && p->numRefs >= newSize)
        return 1;

      MatchFinder_FreeThisClassMemory(p, alloc);
      p->numRefs = newSize;
      p->hash = AllocRefs(newSize, alloc);

      if (p->hash)
      {
        p->son = p->hash + hashSizeSum;
        return 1;
      }
    }
  }

  MatchFinder_Free(p, alloc);
  return 0;
}
492 
493 
/* Recomputes (p->lenLimit) and (p->posLimit): the number of positions
   that may be processed before MatchFinder_CheckLimits() must run again.
   The count is the minimum of: distance to the normalization point,
   distance to the cyclic-buffer wrap, and available input (adjusted by
   keepSizeAfter / matchMaxLen). */
static void MatchFinder_SetLimits(CMatchFinder *p)
{
  UInt32 k;
  UInt32 n = kMaxValForNormalize - p->pos;
  if (n == 0)
    n = (UInt32)(Int32)-1;  // we allow (pos == 0) at start even with (kMaxValForNormalize == 0)

  // limit by the cyclic buffer wrap point
  k = p->cyclicBufferSize - p->cyclicBufferPos;
  if (k < n)
    n = k;

  k = GET_AVAIL_BYTES(p);
  {
    const UInt32 ksa = p->keepSizeAfter;
    UInt32 mm = p->matchMaxLen;
    if (k > ksa)
      k -= ksa; // we must limit exactly to keepSizeAfter for ReadBlock
    else if (k >= mm)
    {
      // the limitation for (p->lenLimit) update
      k -= mm;   // optimization : to reduce the number of checks
      k++;
      // k = 1; // non-optimized version : for debug
    }
    else
    {
      // fewer than matchMaxLen bytes left: shrink lenLimit to what remains
      mm = k;
      if (k != 0)
        k = 1;
    }
    p->lenLimit = mm;
  }
  if (k < n)
    n = k;

  p->posLimit = p->pos + n;
}
531 
532 
/* Clears the fixed-size (2/3-byte) hash head tables at the start of p->hash. */
void MatchFinder_Init_LowHash(CMatchFinder *p)
{
  CLzRef *table = p->hash;
  const CLzRef *end = table + p->fixedHashSize;
  while (table != end)
    *table++ = kEmptyHashValue;
}
541 
542 
/* Clears the main hash table (hashMask + 1 entries) that follows the
   fixed tables in p->hash. */
void MatchFinder_Init_HighHash(CMatchFinder *p)
{
  CLzRef *table = p->hash + p->fixedHashSize;
  const CLzRef *end = table + ((size_t)p->hashMask + 1);
  while (table != end)
    *table++ = kEmptyHashValue;
}
551 
552 
/* Resets buffer pointer, positions, and stream state.
   Positions start at 1 (not 0): kEmptyHashValue = 0 is the NO-VALUE
   marker in the hash tables, and CMatchFinderMt expects (pos == 1).
   Do not change this start value. */
void MatchFinder_Init_4(CMatchFinder *p)
{
  if (!p->directInput)
    p->buffer = p->bufBase;
  p->pos = 1;
  p->streamPos = 1;
  p->result = SZ_OK;
  p->streamEndWasReached = 0;
}
568 
569 
570 // (CYC_TO_POS_OFFSET == 0) is expected by some optimized code
571 #define CYC_TO_POS_OFFSET 0
572 // #define CYC_TO_POS_OFFSET 1 // for debug
573 
/* Full (re)initialization: clears both hash tables, resets positions and
   stream state (Init_4), pre-reads the first block, and sets the
   processing limits. ReadBlock() relies on the state set by Init_4(). */
void MatchFinder_Init(CMatchFinder *p)
{
  MatchFinder_Init_HighHash(p);
  MatchFinder_Init_LowHash(p);
  MatchFinder_Init_4(p);
  // if (readData)
  MatchFinder_ReadBlock(p);

  /* if we init (cyclicBufferPos = pos), then we can use one variable
     instead of both (cyclicBufferPos) and (pos) : only before (cyclicBufferPos) wrapping */
  p->cyclicBufferPos = (p->pos - CYC_TO_POS_OFFSET); // init with relation to (pos)
  // p->cyclicBufferPos = 0; // smallest value
  // p->son[0] = p->son[1] = 0; // unused: we can init skipped record for speculated accesses.
  MatchFinder_SetLimits(p);
}
589 
590 
591 
592 #ifdef MY_CPU_X86_OR_AMD64
593   #if defined(__clang__) && (__clang_major__ >= 4) \
594     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
595     // || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
596 
597       #define USE_LZFIND_SATUR_SUB_128
598       #define USE_LZFIND_SATUR_SUB_256
599       #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
600       #define LZFIND_ATTRIB_AVX2  __attribute__((__target__("avx2")))
601   #elif defined(_MSC_VER)
602     #if (_MSC_VER >= 1600)
603       #define USE_LZFIND_SATUR_SUB_128
604     #endif
605     #if (_MSC_VER >= 1900)
606       #define USE_LZFIND_SATUR_SUB_256
607     #endif
608   #endif
609 
610 // #elif defined(MY_CPU_ARM_OR_ARM64)
611 #elif defined(MY_CPU_ARM64)
612 
613   #if defined(__clang__) && (__clang_major__ >= 8) \
614     || defined(__GNUC__) && (__GNUC__ >= 8)
615       #define USE_LZFIND_SATUR_SUB_128
616     #ifdef MY_CPU_ARM64
617       // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
618     #else
619       // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
620     #endif
621 
622   #elif defined(_MSC_VER)
623     #if (_MSC_VER >= 1910)
624       #define USE_LZFIND_SATUR_SUB_128
625     #endif
626   #endif
627 
628   #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
629     #include <arm64_neon.h>
630   #else
631     #include <arm_neon.h>
632   #endif
633 
634 #endif
635 
636 
637 #ifdef USE_LZFIND_SATUR_SUB_128
638 
639 // #define Z7_SHOW_HW_STATUS
640 
641 #ifdef Z7_SHOW_HW_STATUS
642 #include <stdio.h>
643 #define PRF(x) x
644 PRF(;)
645 #else
646 #define PRF(x)
647 #endif
648 
649 
650 #ifdef MY_CPU_ARM_OR_ARM64
651 
652 #ifdef MY_CPU_ARM64
653 // #define FORCE_LZFIND_SATUR_SUB_128
654 #endif
655 typedef uint32x4_t LzFind_v128;
656 #define SASUB_128_V(v, s) \
657   vsubq_u32(vmaxq_u32(v, s), s)
658 
659 #else // MY_CPU_ARM_OR_ARM64
660 
661 #include <smmintrin.h> // sse4.1
662 
663 typedef __m128i LzFind_v128;
664 // SSE 4.1
665 #define SASUB_128_V(v, s)   \
666   _mm_sub_epi32(_mm_max_epu32(v, s), s)
667 
668 #endif // MY_CPU_ARM_OR_ARM64
669 
670 
671 #define SASUB_128(i) \
672   *(      LzFind_v128 *)(      void *)(items + (i) * 4) = SASUB_128_V( \
673   *(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
674 
675 
/* 128-bit SIMD saturating subtraction over [items, lim):
   items[i] = max(items[i], subValue) - subValue (saturates at 0, which is
   kEmptyHashValue). Processes 16 entries per loop iteration, so the range
   length must be a non-zero multiple of 16 entries (MatchFinder_Normalize3
   passes 128-byte-aligned multiples of 32). */
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_SSE41
LZFIND_ATTRIB_SSE41
#endif
void
Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  // broadcast subValue to all 4 lanes
  const LzFind_v128 sub2 =
    #ifdef MY_CPU_ARM_OR_ARM64
      vdupq_n_u32(subValue);
    #else
      _mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
    #endif
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
    SASUB_128(0)  SASUB_128(1)  items += 2 * 4;
  }
  while (items != lim);
}
699 
700 
701 
702 #ifdef USE_LZFIND_SATUR_SUB_256
703 
704 #include <immintrin.h> // avx
705 /*
706 clang :immintrin.h uses
707 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
708     defined(__AVX2__)
709 #include <avx2intrin.h>
710 #endif
711 so we need <avxintrin.h> for clang-cl */
712 
713 #if defined(__clang__)
714 #include <avxintrin.h>
715 #include <avx2intrin.h>
716 #endif
717 
718 // AVX2:
719 #define SASUB_256(i) \
720     *(      __m256i *)(      void *)(items + (i) * 8) = \
721    _mm256_sub_epi32(_mm256_max_epu32( \
722     *(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
723 
/* AVX2 variant of the saturating subtraction: 8 lanes per vector,
   32 entries per loop iteration. The range length must be a non-zero
   multiple of 32 entries. */
Z7_NO_INLINE
static
#ifdef LZFIND_ATTRIB_AVX2
LZFIND_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  // broadcast subValue to all 8 lanes
  const __m256i sub2 = _mm256_set_epi32(
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
      (Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
    SASUB_256(0)  SASUB_256(1)  items += 2 * 8;
  }
  while (items != lim);
}
744 #endif // USE_LZFIND_SATUR_SUB_256
745 
746 #ifndef FORCE_LZFIND_SATUR_SUB_128
747 typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
748     UInt32 subValue, CLzRef *items, const CLzRef *lim);
749 static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
750 #endif // FORCE_LZFIND_SATUR_SUB_128
751 
752 #endif // USE_LZFIND_SATUR_SUB_128
753 
754 
755 // kEmptyHashValue must be zero
756 // #define SASUB_32(i)  { UInt32 v = items[i];  UInt32 m = v - subValue;  if (v < subValue) m = kEmptyHashValue;  items[i] = m; }
757 #define SASUB_32(i)  { UInt32 v = items[i];  if (v < subValue) v = subValue; items[i] = v - subValue; }
758 
759 #ifdef FORCE_LZFIND_SATUR_SUB_128
760 
761 #define DEFAULT_SaturSub LzFind_SaturSub_128
762 
763 #else
764 
765 #define DEFAULT_SaturSub LzFind_SaturSub_32
766 
/* Scalar fallback for the saturating subtraction; unrolled to 8 entries
   per loop iteration, so the range length must be a non-zero multiple
   of 8 entries. */
Z7_NO_INLINE
static
void
Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  do
  {
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
    SASUB_32(0)  SASUB_32(1)  items += 2;
  }
  while (items != lim);
}
783 
784 #endif
785 
786 
/* Subtracts (subValue) from every entry of items[numItems], saturating at
   0 (kEmptyHashValue), so references older than the reduced offsets become
   "empty". A scalar prologue advances to a 128-byte boundary, the aligned
   middle is handled in bulk (SIMD when available), and a scalar epilogue
   finishes the tail. */
Z7_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
{
  #define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  // scalar prologue: run until (items) is 128-byte aligned
  for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
  {
    // aligned middle: whole multiples of 32 entries (128 bytes)
    const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
    CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
    numItems &= k_Align_Mask;
    if (items != lim)
    {
      #if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
        if (g_LzFind_SaturSub)
          g_LzFind_SaturSub(subValue, items, lim);
        else
      #endif
          DEFAULT_SaturSub(subValue, items, lim);
    }
    items = lim;
  }
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
  // scalar epilogue: remaining tail entries
  for (; numItems != 0; numItems--)
  {
    SASUB_32(0)
    items++;
  }
}
819 
820 
821 
822 // call MatchFinder_CheckLimits() only after (p->pos++) update
823 
/* Periodic maintenance, run when (p->pos) reaches (p->posLimit):
   refills/slides the window if the look-ahead is exhausted, normalizes
   all stored offsets when (pos) reaches the wrap point, wraps the cyclic
   buffer position, and recomputes the limits. */
Z7_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  if (// !p->streamEndWasReached && p->result == SZ_OK &&
      p->keepSizeAfter == GET_AVAIL_BYTES(p))
  {
    // we try to read only in exact state (p->keepSizeAfter == GET_AVAIL_BYTES(p))
    if (MatchFinder_NeedMove(p))
      MatchFinder_MoveBlock(p);
    MatchFinder_ReadBlock(p);
  }

  if (p->pos == kMaxValForNormalize)
  if (GET_AVAIL_BYTES(p) >= p->numHashBytes) // optional optimization for last bytes of data.
    /*
       if we disable normalization for last bytes of data, and
       if (data_size == 4 GiB), we don't call wasteful normalization,
       but (pos) will be wrapped over Zero (0) in that case.
       And we cannot resume later to normal operation
    */
  {
    // MatchFinder_Normalize(p);
    /* after normalization we need (p->pos >= p->historySize + 1); */
    /* we can reduce subValue to aligned value, if want to keep alignment
       of (p->pos) and (p->buffer) for speculated accesses. */
    const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
    // const UInt32 subValue = (1 << 15); // for debug
    // printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
    MatchFinder_REDUCE_OFFSETS(p, subValue)
    // normalize all hash heads (fixed tables + main table) ...
    MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
    {
      // ... and the son[] links (two refs per position in bt mode)
      size_t numSonRefs = p->cyclicBufferSize;
      if (p->btMode)
        numSonRefs <<= 1;
      MatchFinder_Normalize3(subValue, p->son, numSonRefs);
    }
  }

  if (p->cyclicBufferPos == p->cyclicBufferSize)
    p->cyclicBufferPos = 0;

  MatchFinder_SetLimits(p);
}
867 
868 
869 /*
870   (lenLimit > maxLen)
871 */
/*
  Hash-chain match search.
  (lenLimit > maxLen)
  Walks the chain starting at (curMatch), appends every strictly longer
  match as a (len, dist - 1) pair to (d), and returns the new end of (d).
  Stops on an empty link, a distance outside the window, a full-lenLimit
  match, or after (cutValue) chain steps.
*/
Z7_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, unsigned maxLen)
{
  /*
  son[_cyclicBufferPos] = curMatch;
  for (;;)
  {
    UInt32 delta = pos - curMatch;
    if (cutValue-- == 0 || delta >= _cyclicBufferSize)
      return d;
    {
      const Byte *pb = cur - delta;
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      if (pb[maxLen] == cur[maxLen] && *pb == *cur)
      {
        UInt32 len = 0;
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        if (maxLen < len)
        {
          maxLen = len;
          *d++ = len;
          *d++ = delta - 1;
          if (len == lenLimit)
            return d;
        }
      }
    }
  }
  */

  const Byte *lim = cur + lenLimit;
  son[_cyclicBufferPos] = curMatch;  // record current position into the chain

  do
  {
    UInt32 delta;

    if (curMatch == 0)  // kEmptyHashValue: end of chain
      break;
    // if (curMatch2 >= curMatch) return NULL;
    delta = pos - curMatch;
    if (delta >= _cyclicBufferSize)  // candidate fell out of the window
      break;
    {
      ptrdiff_t diff;
      // follow the chain link stored at the candidate's cyclic slot
      curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)];
      diff = (ptrdiff_t)0 - (ptrdiff_t)delta;
      // quick filter: candidate can only beat maxLen if byte at maxLen matches
      if (cur[maxLen] == cur[(ptrdiff_t)maxLen + diff])
      {
        const Byte *c = cur;
        while (*c == c[diff])
        {
          if (++c == lim)
          {
            // reached lenLimit: best possible match, stop searching
            d[0] = (UInt32)(lim - cur);
            d[1] = delta - 1;
            return d + 2;
          }
        }
        {
          const unsigned len = (unsigned)(c - cur);
          if (maxLen < len)
          {
            maxLen = len;
            d[0] = (UInt32)len;
            d[1] = delta - 1;
            d += 2;
          }
        }
      }
    }
  }
  while (--cutValue);

  return d;
}
952 
953 
/*
  Binary-tree match search.
  Descends the tree rooted at the current cyclic slot, appending each
  strictly longer match as a (len, dist - 1) pair to (d), while re-linking
  the tree so the current position becomes the new root. Returns the new
  end of (d). Stops after (cutValue) steps, on a full-lenLimit match, or
  when the candidate leaves the window (then both child links are cleared).
*/
Z7_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
    UInt32 *d, UInt32 maxLen)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;  // "greater" child slot
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);      // "smaller" child slot
  unsigned len0 = 0, len1 = 0;  // known common-prefix lengths on each branch

  UInt32 cmCheck;

  // if (curMatch >= pos) { *ptr0 = *ptr1 = kEmptyHashValue; return NULL; }

  // smallest position still inside the window; older links are dead
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      // both branches share at least min(len0, len1) bytes with cur
      unsigned len = (len0 < len1 ? len0 : len1);
      const UInt32 pair0 = pair[0];
      if (pb[len] == cur[len])
      {
        if (++len != lenLimit && pb[len] == cur[len])
          while (++len != lenLimit)
            if (pb[len] != cur[len])
              break;
        if (maxLen < len)
        {
          maxLen = (UInt32)len;
          *d++ = (UInt32)len;
          *d++ = delta - 1;
          if (len == lenLimit)
          {
            // full-length match: adopt the candidate's children and stop
            *ptr1 = pair0;
            *ptr0 = pair[1];
            return d;
          }
        }
      }
      if (pb[len] < cur[len])
      {
        // candidate is "smaller": link it on the smaller branch, descend right
        *ptr1 = curMatch;
        // const UInt32 curMatch2 = pair[1];
        // if (curMatch2 >= curMatch) { *ptr0 = *ptr1 = kEmptyHashValue;  return NULL; }
        // curMatch = curMatch2;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        // candidate is "greater": link it on the greater branch, descend left
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  // search exhausted: terminate both branches
  *ptr0 = *ptr1 = kEmptyHashValue;
  return d;
}
1023 
1024 
/*
SkipMatchesSpec() - BT tree update without match collection.
Performs the same binary-tree descent as GetMatchesSpec1(), re-linking the
tree so the current position becomes the new root, but produces no match
list. Used when a position is skipped (already covered by a chosen match).
Parameters are the same as in GetMatchesSpec1() minus the output array.
*/
static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
    size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue)
{
  CLzRef *ptr0 = son + ((size_t)_cyclicBufferPos << 1) + 1;
  CLzRef *ptr1 = son + ((size_t)_cyclicBufferPos << 1);
  unsigned len0 = 0, len1 = 0;  // common-prefix lengths proven on each branch

  UInt32 cmCheck;

  // Oldest position still inside the dictionary window.
  cmCheck = (UInt32)(pos - _cyclicBufferSize);
  if ((UInt32)pos <= _cyclicBufferSize)
    cmCheck = 0;

  if (// curMatch >= pos ||  // failure
      cmCheck < curMatch)
  do
  {
    const UInt32 delta = pos - curMatch;
    {
      // Child links of the candidate's node, with cyclic wrap-around.
      CLzRef *pair = son + ((size_t)(_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1);
      const Byte *pb = cur - delta;
      unsigned len = (len0 < len1 ? len0 : len1);
      if (pb[len] == cur[len])
      {
        while (++len != lenLimit)
          if (pb[len] != cur[len])
            break;
        {
          if (len == lenLimit)
          {
            // Full-length match: adopt the candidate's children and stop.
            *ptr1 = pair[0];
            *ptr0 = pair[1];
            return;
          }
        }
      }
      // Descend toward the branch ordered by the first differing byte.
      if (pb[len] < cur[len])
      {
        *ptr1 = curMatch;
        curMatch = pair[1];
        ptr1 = pair + 1;
        len1 = len;
      }
      else
      {
        *ptr0 = curMatch;
        curMatch = pair[0];
        ptr0 = pair;
        len0 = len;
      }
    }
  }
  while(--cutValue && cmCheck < curMatch);

  // Terminate both dangling branches.
  *ptr0 = *ptr1 = kEmptyHashValue;
  return;
}
1082 
1083 
// MOVE_POS: advance the match finder by one byte - step the cyclic-buffer
// index, the window pointer and the absolute position; when pos reaches
// posLimit, call MatchFinder_CheckLimits() (defined earlier in this file).
#define MOVE_POS \
  ++p->cyclicBufferPos; \
  p->buffer++; \
  { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); }

// MOVE_POS_RET: advance one byte, then return the (distances) end pointer.
#define MOVE_POS_RET MOVE_POS return distances;
1090 
/*
MatchFinder_MovePos() - advance one position WITHOUT updating the sons[]
records for the current slot. Only used near the end of the stream, when
fewer bytes are available than the hash needs (see note below).
*/
Z7_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
{
  /* we go here at the end of stream data, when (avail < num_hash_bytes)
     We don't update sons[cyclicBufferPos << btMode].
     So (sons) record will contain junk. And we cannot resume match searching
     to normal operation, even if we will provide more input data in buffer.
     p->sons[p->cyclicBufferPos << p->btMode] = 0;  // kEmptyHashValue
     if (p->btMode)
        p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0;  // kEmptyHashValue
  */
  MOVE_POS
}
1104 
/*
GET_MATCHES_HEADER2 declares the locals shared by all GetMatches/Skip
functions and bails out via (ret_op) when lenLimit is smaller than the
minimum match length (not enough bytes left to hash/match).
*/
#define GET_MATCHES_HEADER2(minLen, ret_op) \
  unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \
  lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
  cur = p->buffer;

#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances)
#define SKIP_HEADER(minLen)   do { GET_MATCHES_HEADER2(minLen, continue)

// Common argument list for GetMatchesSpec1 / SkipMatchesSpec / Hc_GetMatchesSpec.
#define MF_PARAMS(p)  lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

// SKIP_FOOTER: update the tree for the skipped position, advance, and loop.
#define SKIP_FOOTER  SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num);

// Run the search routine (func), then advance one position and return the
// updated (distances) end pointer.
#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
  distances = func(MF_PARAMS(p), \
  distances, (UInt32)_maxLen_); MOVE_POS_RET

#define GET_MATCHES_FOOTER_BT(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)

#define GET_MATCHES_FOOTER_HC(_maxLen_) \
  GET_MATCHES_FOOTER_BASE(_maxLen_, Hc_GetMatchesSpec)



// UPDATE_maxLen: extend the match at distance (d2), starting from the
// already-verified length (maxLen), up to lenLimit.
#define UPDATE_maxLen { \
    const ptrdiff_t diff = (ptrdiff_t)0 - (ptrdiff_t)d2; \
    const Byte *c = cur + maxLen; \
    const Byte *lim = cur + lenLimit; \
    for (; c != lim; c++) if (*(c + diff) != *c) break; \
    maxLen = (unsigned)(c - cur); }
1135 
/*
Bt2_MatchFinder_GetMatches() - BT match finder, direct 2-byte hash.
Appends (length, distance - 1) pairs to (distances); returns the new end.
*/
static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(2)
  HASH2_CALC
  curMatch = p->hash[hv];  // tree root for this 2-byte value
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(1)  // maxLen = 1: report matches of length >= 2
}
1144 
/*
Bt3Zip_MatchFinder_GetMatches() - BT match finder using the Zip-style
3-byte hash (HASH_ZIP_CALC); reports matches of length >= 3.
*/
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_BT(2)  // maxLen = 2: report matches of length >= 3
}
1153 
1154 
// SET_mmm: mmm = maximum valid match distance at the current position -
// the cyclic-buffer (dictionary) size, clipped to (pos) while the window
// is not yet full.
#define SET_mmm  \
  mmm = p->cyclicBufferSize; \
  if (pos < mmm) \
    mmm = pos;
1159 
1160 
/*
Bt3_MatchFinder_GetMatches() - BT match finder with a 3-byte main hash
plus a 2-byte sub-hash (h2) for short matches at small distances.
Appends (length, distance - 1) pairs to (distances); returns the new end.
*/
static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, d2, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(3)

  HASH3_CALC

  hash = p->hash;
  pos = p->pos;

  // Distance to the last position with the same 2-byte hash.
  d2 = pos - hash[h2];

  curMatch = (hash + kFix3HashSize)[hv];

  // Record the current position in both hash tables.
  hash[h2] = pos;
  (hash + kFix3HashSize)[hv] = pos;

  SET_mmm

  maxLen = 2;  // tree search will only report matches longer than this

  if (d2 < mmm && *(cur - d2) == *cur)
  {
    // 2-byte hash hit inside the window: extend it and record the pair.
    UPDATE_maxLen
    distances[0] = (UInt32)maxLen;
    distances[1] = d2 - 1;
    distances += 2;
    if (maxLen == lenLimit)
    {
      // Already at the length limit: only update the tree, then return.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1200 
1201 
/*
Bt4_MatchFinder_GetMatches() - BT match finder with a 4-byte main hash
plus 2-byte (h2) and 3-byte (h3) sub-hashes for short matches at small
distances. Appends (length, distance - 1) pairs to (distances).
*/
static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  // Distances to the last positions with the same 2-byte / 3-byte hash.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  // Record the current position in all three hash tables.
  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;  // tree search will only report matches longer than this

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      // Length-2 candidate at distance d2.
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // The d2 match extends to length 3; keep d2 for UPDATE_maxLen.
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        // Independent length-3 candidate at distance d3.
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    // Extend the chosen short match and patch its recorded length.
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // At the length limit: only update the tree, then return.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1268 
1269 
/*
Bt5_MatchFinder_GetMatches() - BT match finder with a 5-byte main hash
plus 2-byte (h2) and 3-byte (h3) sub-hashes for short matches at small
distances. Appends (length, distance - 1) pairs to (distances).
*/
static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  // Distances to the last positions with the same 2-byte / 3-byte hash.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  // Record the current position in the hash tables.
  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;  // tree search will only report matches longer than this

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      // Length-2 candidate at distance d2.
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // The d2 match extends to length 3; keep d2.
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        // Independent length-3 candidate at distance d3.
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    // Only extend past 3 if byte 3 also matches; otherwise keep length 3.
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // At the length limit: only update the tree, then return.
      SkipMatchesSpec(MF_PARAMS(p));
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_BT(maxLen)
}
1340 
1341 
/*
Hc4_MatchFinder_GetMatches() - hash-chain (HC) match finder with a 4-byte
main hash plus 2-byte (h2) and 3-byte (h3) sub-hashes.
Appends (length, distance - 1) pairs to (distances); returns the new end.
*/
static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, pos;
  unsigned maxLen;
  UInt32 *hash;
  GET_MATCHES_HEADER(4)

  HASH4_CALC

  hash = p->hash;
  pos = p->pos;

  // Distances to the last positions with the same 2-byte / 3-byte hash.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  curMatch = (hash + kFix4HashSize)[hv];

  // Record the current position in all three hash tables.
  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  (hash + kFix4HashSize)[hv] = pos;

  SET_mmm

  maxLen = 3;  // chain search will only report matches longer than this

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      // Length-2 candidate at distance d2.
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // The d2 match extends to length 3; keep d2 for UPDATE_maxLen.
        // distances[-2] = 3;
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        // Independent length-3 candidate at distance d3.
        d2 = d3;
        distances[1] = d3 - 1;
        distances += 2;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      d2 = d3;
      distances[1] = d3 - 1;
      distances += 2;
    }
    else
      break;

    // Extend the chosen short match and patch its recorded length.
    UPDATE_maxLen
    distances[-2] = (UInt32)maxLen;
    if (maxLen == lenLimit)
    {
      // At the length limit: just link the chain (HC mode keeps one link
      // per position in son[]) and return.
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1408 
1409 
/*
Hc5_MatchFinder_GetMatches() - hash-chain (HC) match finder with a 5-byte
main hash plus 2-byte (h2) and 3-byte (h3) sub-hashes.
Appends (length, distance - 1) pairs to (distances); returns the new end.
*/
static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  UInt32 mmm;
  UInt32 h2, h3, d2, d3, maxLen, pos;
  UInt32 *hash;
  GET_MATCHES_HEADER(5)

  HASH5_CALC

  hash = p->hash;
  pos = p->pos;

  // Distances to the last positions with the same 2-byte / 3-byte hash.
  d2 = pos - hash                  [h2];
  d3 = pos - (hash + kFix3HashSize)[h3];
  // d4 = pos - (hash + kFix4HashSize)[h4];

  curMatch = (hash + kFix5HashSize)[hv];

  // Record the current position in the hash tables.
  hash                  [h2] = pos;
  (hash + kFix3HashSize)[h3] = pos;
  // (hash + kFix4HashSize)[h4] = pos;
  (hash + kFix5HashSize)[hv] = pos;

  SET_mmm

  maxLen = 4;  // chain search will only report matches longer than this

  for (;;)
  {
    if (d2 < mmm && *(cur - d2) == *cur)
    {
      // Length-2 candidate at distance d2.
      distances[0] = 2;
      distances[1] = d2 - 1;
      distances += 2;
      if (*(cur - d2 + 2) == cur[2])
      {
        // The d2 match extends to length 3; keep d2.
      }
      else if (d3 < mmm && *(cur - d3) == *cur)
      {
        // Independent length-3 candidate at distance d3.
        distances[1] = d3 - 1;
        distances += 2;
        d2 = d3;
      }
      else
        break;
    }
    else if (d3 < mmm && *(cur - d3) == *cur)
    {
      distances[1] = d3 - 1;
      distances += 2;
      d2 = d3;
    }
    else
      break;

    distances[-2] = 3;
    // Only extend past 3 if byte 3 also matches; otherwise keep length 3.
    if (*(cur - d2 + 3) != cur[3])
      break;
    UPDATE_maxLen
    distances[-2] = maxLen;
    if (maxLen == lenLimit)
    {
      // At the length limit: just link the chain and return.
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET
    }
    break;
  }

  GET_MATCHES_FOOTER_HC(maxLen)
}
1480 
1481 
/*
Hc3Zip_MatchFinder_GetMatches() - hash-chain match finder using the
Zip-style 3-byte hash (HASH_ZIP_CALC); reports matches of length >= 3.
*/
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
  GET_MATCHES_HEADER(3)
  HASH_ZIP_CALC
  curMatch = p->hash[hv];
  p->hash[hv] = p->pos;
  GET_MATCHES_FOOTER_HC(2)  // maxLen = 2: report matches of length >= 3
}
1490 
1491 
// Skip (num) positions in BT2 mode: refresh the 2-byte hash and update the
// tree at each position without collecting matches.
static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(2)
  {
    HASH2_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1502 
// Skip (num) positions in BT mode with the Zip-style 3-byte hash.
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    HASH_ZIP_CALC
    curMatch = p->hash[hv];
    p->hash[hv] = p->pos;
  }
  SKIP_FOOTER
}
1513 
// Skip (num) positions in BT3 mode: refresh the 2-byte and 3-byte hash
// tables and update the tree at each position.
static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(3)
  {
    UInt32 h2;
    UInt32 *hash;
    HASH3_CALC
    hash = p->hash;
    curMatch = (hash + kFix3HashSize)[hv];
    hash[h2] =
    (hash + kFix3HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1528 
// Skip (num) positions in BT4 mode: refresh all three hash tables
// (2-byte, 3-byte, 4-byte) and update the tree at each position.
static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(4)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH4_CALC
    hash = p->hash;
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1544 
// Skip (num) positions in BT5 mode: refresh the 2-byte, 3-byte and 5-byte
// hash tables and update the tree at each position.
static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  SKIP_HEADER(5)
  {
    UInt32 h2, h3;
    UInt32 *hash;
    HASH5_CALC
    hash = p->hash;
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = p->pos;
  }
  SKIP_FOOTER
}
1561 
1562 
/*
HC_SKIP_HEADER / HC_SKIP_FOOTER - batched hash-chain skip loop.
The header clamps the batch (num2) to the remaining room before posLimit,
reserves the son[] range for the batch, and opens the per-position loop in
which the caller computes the hash and sets (curMatch); the footer stores
one chain link per position, commits buffer/pos, and re-checks the limits.
*/
#define HC_SKIP_HEADER(minLen) \
    do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
    const Byte *cur; \
    UInt32 *hash; \
    UInt32 *son; \
    UInt32 pos = p->pos; \
    UInt32 num2 = num; \
    /* (p->pos == p->posLimit) is not allowed here !!! */ \
    { const UInt32 rem = p->posLimit - pos; if (num2 > rem) num2 = rem; } \
    num -= num2; \
    { const UInt32 cycPos = p->cyclicBufferPos; \
      son = p->son + cycPos; \
      p->cyclicBufferPos = cycPos + num2; } \
    cur = p->buffer; \
    hash = p->hash; \
    do { \
    UInt32 curMatch; \
    UInt32 hv;


#define HC_SKIP_FOOTER \
    cur++;  pos++;  *son++ = curMatch; \
    } while (--num2); \
    p->buffer = cur; \
    p->pos = pos; \
    if (pos == p->posLimit) MatchFinder_CheckLimits(p); \
    }} while(num); \
1591 
// Skip (num) positions in HC4 mode using the batched hash-chain loop:
// refresh the 2-byte, 3-byte and 4-byte hash tables at each position.
static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(4)

    UInt32 h2, h3;
    HASH4_CALC
    curMatch = (hash + kFix4HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    (hash + kFix4HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1605 
1606 
// Skip (num) positions in HC5 mode using the batched hash-chain loop:
// refresh the 2-byte, 3-byte and 5-byte hash tables at each position.
static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(5)

    UInt32 h2, h3;
    HASH5_CALC
    curMatch = (hash + kFix5HashSize)[hv];
    hash                  [h2] =
    (hash + kFix3HashSize)[h3] =
    // (hash + kFix4HashSize)[h4] =
    (hash + kFix5HashSize)[hv] = pos;

  HC_SKIP_FOOTER
}
1621 
1622 
// Skip (num) positions in HC mode with the Zip-style 3-byte hash.
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
  HC_SKIP_HEADER(3)

    HASH_ZIP_CALC
    curMatch = hash[hv];
    hash[hv] = pos;

  HC_SKIP_FOOTER
}
1633 
1634 
/*
MatchFinder_CreateVTable() - fill (vTable) with the function pointers that
implement the match finder selected by p->btMode and p->numHashBytes:
hash-chain (hc4/hc5) when btMode is 0, otherwise a binary-tree variant
(bt2/bt3/bt4/bt5) chosen by the number of hash bytes.
*/
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
{
  // Entries common to every variant.
  vTable->Init = (Mf_Init_Func)MatchFinder_Init;
  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;

  if (!p->btMode)
  {
    // Hash-chain mode: hc4 covers numHashBytes <= 4, hc5 everything above.
    if (p->numHashBytes <= 4)
    {
      vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
    }
    else
    {
      vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip;
    }
    return;
  }

  // Binary-tree mode: dispatch on the number of hash bytes (>= 5 -> bt5).
  switch (p->numHashBytes)
  {
    case 2:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
      break;
    case 3:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
      break;
    case 4:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
      break;
    default:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip;
      break;
  }
}
1674 
1675 
1676 
/*
LzFindPrepare() - one-time runtime CPU-feature dispatch: selects the
vectorized saturating-subtract routine and stores it in g_LzFind_SaturSub
(NEON on ARM; SSE4.1, upgraded to AVX2 when available, on x86). If no
supported extension is detected, (f) stays NULL — presumably a scalar
fallback is used by the caller of g_LzFind_SaturSub (not visible here).
No-op when FORCE_LZFIND_SATUR_SUB_128 or !USE_LZFIND_SATUR_SUB_128.
*/
void LzFindPrepare(void)
{
  #ifndef FORCE_LZFIND_SATUR_SUB_128
  #ifdef USE_LZFIND_SATUR_SUB_128
  LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
  #ifdef MY_CPU_ARM_OR_ARM64
  {
    if (CPU_IsSupported_NEON())
    {
      // #pragma message ("=== LzFind NEON")
      PRF(printf("\n=== LzFind NEON\n"));
      f = LzFind_SaturSub_128;
    }
    // f = 0; // for debug
  }
  #else // MY_CPU_ARM_OR_ARM64
  if (CPU_IsSupported_SSE41())
  {
    // #pragma message ("=== LzFind SSE41")
    PRF(printf("\n=== LzFind SSE41\n"));
    f = LzFind_SaturSub_128;

    #ifdef USE_LZFIND_SATUR_SUB_256
    if (CPU_IsSupported_AVX2())
    {
      // #pragma message ("=== LzFind AVX2")
      PRF(printf("\n=== LzFind AVX2\n"));
      f = LzFind_SaturSub_256;
    }
    #endif
  }
  #endif // MY_CPU_ARM_OR_ARM64
  g_LzFind_SaturSub = f;
  #endif // USE_LZFIND_SATUR_SUB_128
  #endif // FORCE_LZFIND_SATUR_SUB_128
}
1713 
1714 
1715 #undef MOVE_POS
1716 #undef MOVE_POS_RET
1717 #undef PRF
1718