• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ZstdDec.c -- Zstd Decoder
2 2024-01-21 : the code was developed by Igor Pavlov, using Zstandard format
3              specification and original zstd decoder code as reference code.
4 original zstd decoder code: Copyright (c) Facebook, Inc. All rights reserved.
5 This source code is licensed under BSD 3-Clause License.
6 */
7 
8 #include "Precomp.h"
9 
10 #include <string.h>
11 #include <stdlib.h>
12 // #include <stdio.h>
13 
14 #include "Alloc.h"
15 #include "Xxh64.h"
16 #include "ZstdDec.h"
17 #include "CpuArch.h"
18 
19 #if defined(MY_CPU_ARM64)
20 #include <arm_neon.h>
21 #endif
22 
23 /* original-zstd still doesn't support window larger than 2 GiB.
24    So we also limit our decoder for 2 GiB window: */
25 #if defined(MY_CPU_64BIT) && 0 == 1
26   #define MAX_WINDOW_SIZE_LOG  41
27 #else
28   #define MAX_WINDOW_SIZE_LOG  31
29 #endif
30 
31 typedef
32   #if MAX_WINDOW_SIZE_LOG < 32
33     UInt32
34   #else
35     size_t
36   #endif
37     CZstdDecOffset;
38 
39 // for debug: simpler and smaller code but slow:
40 // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
41 
42 // #define SHOW_STAT
43 #ifdef SHOW_STAT
44 #include <stdio.h>
45 static unsigned g_Num_Blocks_Compressed = 0;
46 static unsigned g_Num_Blocks_memcpy = 0;
47 static unsigned g_Num_Wrap_memmove_Num = 0;
48 static unsigned g_Num_Wrap_memmove_Bytes = 0;
49 static unsigned g_NumSeqs_total = 0;
50 // static unsigned g_NumCopy = 0;
51 static unsigned g_NumOver = 0;
52 static unsigned g_NumOver2 = 0;
53 static unsigned g_Num_Match = 0;
54 static unsigned g_Num_Lits = 0;
55 static unsigned g_Num_LitsBig = 0;
56 static unsigned g_Num_Lit0 = 0;
57 static unsigned g_Num_Rep0 = 0;
58 static unsigned g_Num_Rep1 = 0;
59 static unsigned g_Num_Rep2 = 0;
60 static unsigned g_Num_Rep3 = 0;
61 static unsigned g_Num_Threshold_0 = 0;
62 static unsigned g_Num_Threshold_1 = 0;
63 static unsigned g_Num_Threshold_0sum = 0;
64 static unsigned g_Num_Threshold_1sum = 0;
65 #define STAT_UPDATE(v) v
66 #else
67 #define STAT_UPDATE(v)
68 #endif
69 #define STAT_INC(v)  STAT_UPDATE(v++;)
70 
71 
// A (pointer, length) pair describing a slice of read-only input.
// Decoder helpers advance ptr and decrease len as bytes are consumed.
typedef struct
{
  const Byte *ptr;  // current read position
  size_t len;       // number of bytes remaining at ptr
}
CInBufPair;
78 
79 
80 #if defined(MY_CPU_ARM_OR_ARM64) || defined(MY_CPU_X86_OR_AMD64)
81   #if (defined(__clang__) && (__clang_major__ >= 6)) \
82    || (defined(__GNUC__) && (__GNUC__ >= 6))
83     // disable for debug:
84     #define Z7_ZSTD_DEC_USE_BSR
85   #elif defined(_MSC_VER) && (_MSC_VER >= 1300)
86     // #if defined(MY_CPU_ARM_OR_ARM64)
87     #if (_MSC_VER >= 1600)
88       #include <intrin.h>
89     #endif
90     // disable for debug:
91     #define Z7_ZSTD_DEC_USE_BSR
92   #endif
93 #endif
94 
95 #ifdef Z7_ZSTD_DEC_USE_BSR
96   #if defined(__clang__) || defined(__GNUC__)
97     #define MY_clz(x)  ((unsigned)__builtin_clz((UInt32)x))
98   #else  // #if defined(_MSC_VER)
99     #ifdef MY_CPU_ARM_OR_ARM64
100       #define MY_clz  _CountLeadingZeros
101     #endif // MY_CPU_X86_OR_AMD64
102   #endif // _MSC_VER
103 #elif !defined(Z7_ZSTD_DEC_USE_LOG_TABLE)
104   #define Z7_ZSTD_DEC_USE_LOG_TABLE
105 #endif
106 
107 
108 static
109 Z7_FORCE_INLINE
GetHighestSetBit_32_nonzero_big(UInt32 num)110 unsigned GetHighestSetBit_32_nonzero_big(UInt32 num)
111 {
112   // (num != 0)
113   #ifdef MY_clz
114     return 31 - MY_clz(num);
115   #elif defined(Z7_ZSTD_DEC_USE_BSR)
116   {
117     unsigned long zz;
118     _BitScanReverse(&zz, num);
119     return zz;
120   }
121   #else
122   {
123     int i = -1;
124     for (;;)
125     {
126       i++;
127       num >>= 1;
128       if (num == 0)
129         return (unsigned)i;
130     }
131   }
132   #endif
133 }
134 
135 #ifdef Z7_ZSTD_DEC_USE_LOG_TABLE
136 
137 #define R1(a)  a, a
138 #define R2(a)  R1(a), R1(a)
139 #define R3(a)  R2(a), R2(a)
140 #define R4(a)  R3(a), R3(a)
141 #define R5(a)  R4(a), R4(a)
142 #define R6(a)  R5(a), R5(a)
143 #define R7(a)  R6(a), R6(a)
144 #define R8(a)  R7(a), R7(a)
145 #define R9(a)  R8(a), R8(a)
146 
147 #define Z7_ZSTD_FSE_MAX_ACCURACY  9
148 // states[] values in FSE_Generate() can use (Z7_ZSTD_FSE_MAX_ACCURACY + 1) bits.
149 static const Byte k_zstd_LogTable[2 << Z7_ZSTD_FSE_MAX_ACCURACY] =
150 {
151   R1(0), R1(1), R2(2), R3(3), R4(4), R5(5), R6(6), R7(7), R8(8), R9(9)
152 };
153 
154 #define GetHighestSetBit_32_nonzero_small(num)  (k_zstd_LogTable[num])
155 #else
156 #define GetHighestSetBit_32_nonzero_small  GetHighestSetBit_32_nonzero_big
157 #endif
158 
159 
160 #ifdef MY_clz
161   #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
162     bitOffset -= (CBitCtr)(MY_clz(b) - 23);
163 #elif defined(Z7_ZSTD_DEC_USE_BSR)
164   #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
165     { unsigned long zz;  _BitScanReverse(&zz, b);  bitOffset -= 8;  bitOffset += zz; }
166 #else
167   #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
168     for (;;) { bitOffset--;  if (b & 0x80) { break; }  b <<= 1; }
169 #endif
170 
171 #define SET_bitOffset_TO_PAD(bitOffset, src, srcLen) \
172 { \
173   unsigned lastByte = (src)[(size_t)(srcLen) - 1]; \
174   if (lastByte == 0) return SZ_ERROR_DATA; \
175   bitOffset = (CBitCtr)((srcLen) * 8); \
176   UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
177 }
178 
179 #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
180 
181 #define SET_bitOffset_TO_PAD_and_SET_BIT_SIZE(bitOffset, src, srcLen_res) \
182 { \
183   unsigned lastByte = (src)[(size_t)(srcLen_res) - 1]; \
184   if (lastByte == 0) return SZ_ERROR_DATA; \
185   srcLen_res *= 8; \
186   bitOffset = (CBitCtr)srcLen_res; \
187   UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
188 }
189 
190 #endif
191 
192 /*
193 typedef Int32 CBitCtr_signed;
194 typedef Int32 CBitCtr;
195 */
196 // /*
197 typedef ptrdiff_t CBitCtr_signed;
198 typedef ptrdiff_t CBitCtr;
199 // */
200 
201 
202 #define MATCH_LEN_MIN  3
203 #define kBlockSizeMax  (1u << 17)
204 
205 // #define Z7_ZSTD_DEC_PRINT_TABLE
206 
207 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
208 #define NUM_OFFSET_SYMBOLS_PREDEF 29
209 #endif
210 #define NUM_OFFSET_SYMBOLS_MAX    (MAX_WINDOW_SIZE_LOG + 1)  // 32
211 #define NUM_LL_SYMBOLS            36
212 #define NUM_ML_SYMBOLS            53
213 #define FSE_NUM_SYMBOLS_MAX       53  // NUM_ML_SYMBOLS
214 
215 // /*
216 #if !defined(MY_CPU_X86) || defined(__PIC__) || defined(MY_CPU_64BIT)
217 #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
218 #endif
219 // */
220 // for debug:
221 // #define Z7_ZSTD_DEC_USE_BASES_LOCAL
222 // #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
223 
224 #define GLOBAL_TABLE(n)  k_ ## n
225 
226 #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL)
227   #define BASES_TABLE(n)  a_ ## n
228 #elif defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
229   #define BASES_TABLE(n)  p->m_ ## n
230 #else
231   #define BASES_TABLE(n)  GLOBAL_TABLE(n)
232 #endif
233 
234 #define Z7_ZSTD_DEC_USE_ML_PLUS3
235 
236 #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) || \
237     defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
238 
239 #define SEQ_EXTRA_TABLES(n) \
240   Byte   n ## SEQ_LL_EXTRA [NUM_LL_SYMBOLS]; \
241   Byte   n ## SEQ_ML_EXTRA [NUM_ML_SYMBOLS]; \
242   UInt32 n ## SEQ_LL_BASES [NUM_LL_SYMBOLS]; \
243   UInt32 n ## SEQ_ML_BASES [NUM_ML_SYMBOLS]; \
244 
245 #define Z7_ZSTD_DEC_USE_BASES_CALC
246 
247 #ifdef Z7_ZSTD_DEC_USE_BASES_CALC
248 
249   #define FILL_LOC_BASES(n, startSum) \
250     { unsigned i; UInt32 sum = startSum; \
251       for (i = 0; i != Z7_ARRAY_SIZE(GLOBAL_TABLE(n ## _EXTRA)); i++) \
252       { const unsigned a = GLOBAL_TABLE(n ## _EXTRA)[i]; \
253         BASES_TABLE(n ## _BASES)[i] = sum; \
254         /* if (sum != GLOBAL_TABLE(n ## _BASES)[i]) exit(1); */ \
255         sum += (UInt32)1 << a; \
256         BASES_TABLE(n ## _EXTRA)[i] = (Byte)a; }}
257 
258   #define FILL_LOC_BASES_ALL \
259       FILL_LOC_BASES (SEQ_LL, 0) \
260       FILL_LOC_BASES (SEQ_ML, MATCH_LEN_MIN) \
261 
262 #else
263   #define COPY_GLOBAL_ARR(n)  \
264     memcpy(BASES_TABLE(n), GLOBAL_TABLE(n), sizeof(GLOBAL_TABLE(n)));
265   #define FILL_LOC_BASES_ALL \
266     COPY_GLOBAL_ARR (SEQ_LL_EXTRA) \
267     COPY_GLOBAL_ARR (SEQ_ML_EXTRA) \
268     COPY_GLOBAL_ARR (SEQ_LL_BASES) \
269     COPY_GLOBAL_ARR (SEQ_ML_BASES) \
270 
271 #endif
272 
273 #endif
274 
275 
276 
277 /// The sequence decoding baseline and number of additional bits to read/add
278 #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
279 static const UInt32 GLOBAL_TABLE(SEQ_LL_BASES) [NUM_LL_SYMBOLS] =
280 {
281   0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
282   16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
283   0x2000, 0x4000, 0x8000, 0x10000
284 };
285 #endif
286 
287 static const Byte GLOBAL_TABLE(SEQ_LL_EXTRA) [NUM_LL_SYMBOLS] =
288 {
289   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
290   1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12,
291   13, 14, 15, 16
292 };
293 
294 #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
295 static const UInt32 GLOBAL_TABLE(SEQ_ML_BASES) [NUM_ML_SYMBOLS] =
296 {
297   3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
298   19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
299   35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
300   0x1003, 0x2003, 0x4003, 0x8003, 0x10003
301 };
302 #endif
303 
304 static const Byte GLOBAL_TABLE(SEQ_ML_EXTRA) [NUM_ML_SYMBOLS] =
305 {
306   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308   1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11,
309   12, 13, 14, 15, 16
310 };
311 
312 
313 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
314 
315 static const Int16 SEQ_LL_PREDEF_DIST [NUM_LL_SYMBOLS] =
316 {
317   4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
318   2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
319  -1,-1,-1,-1
320 };
321 static const Int16 SEQ_OFFSET_PREDEF_DIST [NUM_OFFSET_SYMBOLS_PREDEF] =
322 {
323   1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
324   1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1
325 };
326 static const Int16 SEQ_ML_PREDEF_DIST [NUM_ML_SYMBOLS] =
327 {
328   1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
329   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
330   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
331  -1,-1,-1,-1,-1
332 };
333 
334 #endif
335 
336 // typedef int FastInt;
337 // typedef Int32 FastInt32;
338 typedef unsigned FastInt;
339 typedef UInt32 FastInt32;
340 typedef FastInt32 CFseRecord;
341 
342 
343 #define FSE_REC_LEN_OFFSET    8
344 #define FSE_REC_STATE_OFFSET  16
345 #define GET_FSE_REC_SYM(st)   ((Byte)(st))
346 #define GET_FSE_REC_LEN(st)   ((Byte)((st) >> FSE_REC_LEN_OFFSET))
347 #define GET_FSE_REC_STATE(st) ((st) >> FSE_REC_STATE_OFFSET)
348 
349 // #define FSE_REC_SYM_MASK      (0xff)
350 // #define GET_FSE_REC_SYM(st)   (st & FSE_REC_SYM_MASK)
351 
352 #define W_BASE(state, len, sym) \
353     (((UInt32)state << (4 + FSE_REC_STATE_OFFSET)) + \
354     (len << FSE_REC_LEN_OFFSET) + (sym))
355 #define W(state, len, sym)  W_BASE(state, len, sym)
356 static const CFseRecord k_PredefRecords_LL[1 << 6] = {
357 W(0,4, 0),W(1,4, 0),W(2,5, 1),W(0,5, 3),W(0,5, 4),W(0,5, 6),W(0,5, 7),W(0,5, 9),
358 W(0,5,10),W(0,5,12),W(0,6,14),W(0,5,16),W(0,5,18),W(0,5,19),W(0,5,21),W(0,5,22),
359 W(0,5,24),W(2,5,25),W(0,5,26),W(0,6,27),W(0,6,29),W(0,6,31),W(2,4, 0),W(0,4, 1),
360 W(0,5, 2),W(2,5, 4),W(0,5, 5),W(2,5, 7),W(0,5, 8),W(2,5,10),W(0,5,11),W(0,6,13),
361 W(2,5,16),W(0,5,17),W(2,5,19),W(0,5,20),W(2,5,22),W(0,5,23),W(0,4,25),W(1,4,25),
362 W(2,5,26),W(0,6,28),W(0,6,30),W(3,4, 0),W(1,4, 1),W(2,5, 2),W(2,5, 3),W(2,5, 5),
363 W(2,5, 6),W(2,5, 8),W(2,5, 9),W(2,5,11),W(2,5,12),W(0,6,15),W(2,5,17),W(2,5,18),
364 W(2,5,20),W(2,5,21),W(2,5,23),W(2,5,24),W(0,6,35),W(0,6,34),W(0,6,33),W(0,6,32)
365 };
366 static const CFseRecord k_PredefRecords_OF[1 << 5] = {
367 W(0,5, 0),W(0,4, 6),W(0,5, 9),W(0,5,15),W(0,5,21),W(0,5, 3),W(0,4, 7),W(0,5,12),
368 W(0,5,18),W(0,5,23),W(0,5, 5),W(0,4, 8),W(0,5,14),W(0,5,20),W(0,5, 2),W(1,4, 7),
369 W(0,5,11),W(0,5,17),W(0,5,22),W(0,5, 4),W(1,4, 8),W(0,5,13),W(0,5,19),W(0,5, 1),
370 W(1,4, 6),W(0,5,10),W(0,5,16),W(0,5,28),W(0,5,27),W(0,5,26),W(0,5,25),W(0,5,24)
371 };
372 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
373 #undef W
374 #define W(state, len, sym)  W_BASE(state, len, (sym + MATCH_LEN_MIN))
375 #endif
376 static const CFseRecord k_PredefRecords_ML[1 << 6] = {
377 W(0,6, 0),W(0,4, 1),W(2,5, 2),W(0,5, 3),W(0,5, 5),W(0,5, 6),W(0,5, 8),W(0,6,10),
378 W(0,6,13),W(0,6,16),W(0,6,19),W(0,6,22),W(0,6,25),W(0,6,28),W(0,6,31),W(0,6,33),
379 W(0,6,35),W(0,6,37),W(0,6,39),W(0,6,41),W(0,6,43),W(0,6,45),W(1,4, 1),W(0,4, 2),
380 W(2,5, 3),W(0,5, 4),W(2,5, 6),W(0,5, 7),W(0,6, 9),W(0,6,12),W(0,6,15),W(0,6,18),
381 W(0,6,21),W(0,6,24),W(0,6,27),W(0,6,30),W(0,6,32),W(0,6,34),W(0,6,36),W(0,6,38),
382 W(0,6,40),W(0,6,42),W(0,6,44),W(2,4, 1),W(3,4, 1),W(1,4, 2),W(2,5, 4),W(2,5, 5),
383 W(2,5, 7),W(2,5, 8),W(0,6,11),W(0,6,14),W(0,6,17),W(0,6,20),W(0,6,23),W(0,6,26),
384 W(0,6,29),W(0,6,52),W(0,6,51),W(0,6,50),W(0,6,49),W(0,6,48),W(0,6,47),W(0,6,46)
385 };
386 
387 
// sum of freqs[] must be correct
// (numSyms != 0)
// (accuracy >= 5)
//
// Builds an FSE decoding table of (1 << accuracy) records.
// table : out: one CFseRecord per state; after the final pass each record
//         packs symbol (low byte), number of bits to read
//         (<< FSE_REC_LEN_OFFSET) and the baseline for the next state
//         (<< FSE_REC_STATE_OFFSET).
// freqs : normalized frequencies; the value -1 marks a "less than 1"
//         probability symbol that gets exactly one cell at the table end.
// delta : bias added to every stored symbol value (the caller passes
//         MATCH_LEN_MIN for the match-length table — see the
//         Z7_ZSTD_DEC_USE_ML_PLUS3 call sites).
static
Z7_NO_INLINE
// Z7_FORCE_INLINE
void FSE_Generate(CFseRecord *table,
    const Int16 *const freqs, const size_t numSyms,
    const unsigned accuracy, UInt32 delta)
{
  size_t size = (size_t)1 << accuracy;
  // max value in states[x] is ((1 << accuracy) * 2)
  UInt16 states[FSE_NUM_SYMBOLS_MAX];
  {
    /* Symbols with "less than 1" probability get a single cell,
       starting from the end of the table.
       These symbols define a full state reset, reading (accuracy) bits. */
    size_t threshold = size;
    {
      size_t s = 0;
      do
        if (freqs[s] == -1)
        {
          table[--threshold] = (CFseRecord)s;
          states[s] = 1;
        }
      while (++s != numSyms);
    }

    #ifdef SHOW_STAT
    if (threshold == size)
    {
      STAT_INC(g_Num_Threshold_0)
      STAT_UPDATE(g_Num_Threshold_0sum += (unsigned)size;)
    }
    else
    {
      STAT_INC(g_Num_Threshold_1)
      STAT_UPDATE(g_Num_Threshold_1sum += (unsigned)size;)
    }
    #endif

    // { unsigned uuu; for (uuu = 0; uuu < 400; uuu++)
    {
      // Each (symbol) gets freqs[symbol] cells.
      // Cell allocation is spread, not linear.
      const size_t step = (size >> 1) + (size >> 3) + 3;
      size_t pos = 0;
      // const unsigned mask = size - 1;
      /*
      if (threshold == size)
      {
        size_t s = 0;
        size--;
        do
        {
          int freq = freqs[s];
          if (freq <= 0)
            continue;
          states[s] = (UInt16)freq;
          do
          {
            table[pos] (CFseRecord)s;
            pos = (pos + step) & size; // & mask;
          }
          while (--freq);
        }
        while (++s != numSyms);
      }
      else
      */
      {
        size_t s = 0;
        // (size) is temporarily decremented so it can serve as the
        // wrap-around mask for the spread walk below.
        size--;
        do
        {
          int freq = freqs[s];
          // skips both zero-frequency and "less than 1" (-1) symbols;
          // the continue jumps to the (++s != numSyms) test
          if (freq <= 0)
            continue;
          states[s] = (UInt16)freq;
          do
          {
            table[pos] = (CFseRecord)s;
            // we skip position, if it's already occupied by a "less than 1" probability symbol.
            // (step) is coprime to table size, so the cycle will visit each position exactly once
            do
              pos = (pos + step) & size; // & mask;
            while (pos >= threshold);
          }
          while (--freq);
        }
        while (++s != numSyms);
      }
      size++;  // restore the real table size
      // (pos != 0) is unexpected case that means that freqs[] are not correct.
      // so it's some failure in code (for example, incorrect predefined freq[] table)
      // if (pos != 0) return SZ_ERROR_FAIL;
    }
    // }
  }
  {
    // Second pass: rewrite each cell in place into the packed record form.
    const CFseRecord * const limit = table + size;
    // Fold (size << FSE_REC_STATE_OFFSET) and the symbol bias into one
    // subtrahend so the per-cell expression stays a single subtraction.
    delta = ((UInt32)size << FSE_REC_STATE_OFFSET) - delta;
    /* State increases by symbol over time, decreasing number of bits.
       Baseline increases until the bit threshold is passed, at which point it resets to 0 */
    do
    {
      #define TABLE_ITER(a) \
      { \
        const FastInt sym = (FastInt)table[a]; \
        const unsigned nextState = states[sym]; \
        unsigned nb; \
        states[sym] = (UInt16)(nextState + 1); \
        nb = accuracy - GetHighestSetBit_32_nonzero_small(nextState); \
        table[a] = (CFseRecord)(sym - delta \
            + ((UInt32)nb << FSE_REC_LEN_OFFSET) \
            + ((UInt32)nextState << FSE_REC_STATE_OFFSET << nb)); \
      }
      // process two cells per iteration (table size is a power of 2 >= 32)
      TABLE_ITER(0)
      TABLE_ITER(1)
      table += 2;
    }
    while (table != limit);
  }
}
513 
514 
515 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
516 
Print_Predef(unsigned predefAccuracy,const unsigned numSymsPredef,const Int16 * const predefFreqs,const CFseRecord * checkTable)517 static void Print_Predef(unsigned predefAccuracy,
518     const unsigned numSymsPredef,
519     const Int16 * const predefFreqs,
520     const CFseRecord *checkTable)
521 {
522   CFseRecord table[1 << 6];
523   unsigned i;
524   FSE_Generate(table, predefFreqs, numSymsPredef, predefAccuracy,
525         #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
526           numSymsPredef == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
527         #endif
528           0
529     );
530   if (memcmp(table, checkTable, sizeof(UInt32) << predefAccuracy) != 0)
531     exit(1);
532   for (i = 0; i < (1u << predefAccuracy); i++)
533   {
534     const UInt32 v = table[i];
535     const unsigned state = (unsigned)(GET_FSE_REC_STATE(v));
536     if (state & 0xf)
537       exit(1);
538     if (i != 0)
539     {
540       printf(",");
541       if (i % 8 == 0)
542         printf("\n");
543     }
544     printf("W(%d,%d,%2d)",
545         (unsigned)(state >> 4),
546         (unsigned)((v >> FSE_REC_LEN_OFFSET) & 0xff),
547         (unsigned)GET_FSE_REC_SYM(v));
548   }
549   printf("\n\n");
550 }
551 
552 #endif
553 
554 
555 #define GET16(dest, p)  { const Byte *ptr = p;  dest = GetUi16(ptr); }
556 #define GET32(dest, p)  { const Byte *ptr = p;  dest = GetUi32(ptr); }
557 
558 // (1 <= numBits <= 9)
559 #define FORWARD_READ_BITS(destVal, numBits, mask) \
560   { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
561     if (bos3 >= 0) return SZ_ERROR_DATA; \
562     GET16(destVal, src + bos3) \
563     destVal >>= (bitOffset) & 7; \
564     bitOffset += (CBitCtr_signed)(numBits); \
565     mask = (1u << (numBits)) - 1; \
566     destVal &= mask; \
567   }
568 
569 #define FORWARD_READ_1BIT(destVal) \
570   { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
571     if (bos3 >= 0) return SZ_ERROR_DATA; \
572     destVal = *(src + bos3); \
573     destVal >>= (bitOffset) & 7; \
574     (bitOffset)++; \
575     destVal &= 1; \
576   }
577 
578 
// in: (accuracyMax <= 9)
// at least 2 bytes will be processed from (in) stream.
// at return: (in->len > 0)
//
// Parses a zstd FSE table description and builds the decode table.
// table        : out: filled by FSE_Generate with (1 << accuracy) records
// in           : in/out: on success ptr/len are advanced past the header
// accuracyMax  : maximum allowed accuracy (table log)
// accuracyRes  : out: the accuracy read from the header (5..accuracyMax)
// numSymbolsMax: upper bound on symbol count; also selects the ML bias
//                when Z7_ZSTD_DEC_USE_ML_PLUS3 is defined
// Returns SZ_OK or SZ_ERROR_DATA on any malformed header.
static
Z7_NO_INLINE
SRes FSE_DecodeHeader(CFseRecord *const table,
    CInBufPair *const in,
    const unsigned accuracyMax,
    Byte *const accuracyRes,
    unsigned numSymbolsMax)
{
  unsigned accuracy;
  unsigned remain1;
  unsigned syms;
  Int16 freqs[FSE_NUM_SYMBOLS_MAX + 3]; // +3 for overwrite (repeat)
  const Byte *src = in->ptr;
  CBitCtr_signed bitOffset = (CBitCtr_signed)in->len - 1;
  if (bitOffset <= 0)
    return SZ_ERROR_DATA;
  // low nibble of the first byte encodes (accuracy - 5)
  accuracy = *src & 0xf;
  accuracy += 5;
  if (accuracy > accuracyMax)
    return SZ_ERROR_DATA;
  *accuracyRes = (Byte)accuracy;
  remain1 = (1u << accuracy) + 1; // (it's remain_freqs_sum + 1)
  syms = 0;
  src += bitOffset;  // src points to last byte
  // bitOffset is kept relative to the END of the buffer (negative while
  // inside it); reading starts at bit 4 of the first byte.
  bitOffset = 4 - (bitOffset << 3);

  for (;;)
  {
    // (2 <= remain1)
    const unsigned bits = GetHighestSetBit_32_nonzero_small((unsigned)remain1);
    // (1 <= bits <= accuracy)
    unsigned val; // it must be unsigned or int
    unsigned mask;
    FORWARD_READ_BITS(val, bits, mask)
    {
      // Two-level code: values in the upper part of the range need one
      // extra bit to disambiguate.
      const unsigned val2 = remain1 + val - mask;
      if (val2 > mask)
      {
        unsigned bit;
        FORWARD_READ_1BIT(bit)
        if (bit)
          val = val2;
      }
    }
    {
      // (remain1 >= 2)
      // (0 <= (int)val <= remain1)
      val = (unsigned)((int)val - 1);
      // val now is "probability" of symbol
      // (probability == -1) means "less than 1" frequency.
      // (-1 <= (int)val <= remain1 - 1)
      freqs[syms++] = (Int16)(int)val;
      if (val != 0)
      {
        // a "less than 1" symbol still consumes 1 from the remaining sum
        remain1 -= (int)val < 0 ? 1u : (unsigned)val;
        // remain1 -= val;
        // val >>= (sizeof(val) * 8 - 2);
        // remain1 -= val & 2;
        // freqs[syms++] = (Int16)(int)val;
        // syms++;
        if (remain1 == 1)
          break;
        if (syms >= FSE_NUM_SYMBOLS_MAX)
          return SZ_ERROR_DATA;
      }
      else // if (val == 0)
      {
        // freqs[syms++] = 0;
        // syms++;
        // zero probability is followed by 2-bit repeat counts;
        // a count of 3 means "3 more zeros, then another repeat field".
        for (;;)
        {
          unsigned repeat;
          FORWARD_READ_BITS(repeat, 2, mask)
          // always write 3 zeros (freqs[] has +3 slack for this overwrite)
          freqs[syms    ] = 0;
          freqs[syms + 1] = 0;
          freqs[syms + 2] = 0;
          syms += repeat;
          if (syms >= FSE_NUM_SYMBOLS_MAX)
            return SZ_ERROR_DATA;
          if (repeat != 3)
            break;
        }
      }
    }
  }

  if (syms > numSymbolsMax)
    return SZ_ERROR_DATA;
  // round the bit position up to a byte boundary; it must not pass the end
  bitOffset += 7;
  bitOffset >>= 3;
  if (bitOffset > 0)
    return SZ_ERROR_DATA;
  in->ptr = src + bitOffset;
  in->len = (size_t)(1 - bitOffset);
  {
    // unsigned uuu; for (uuu = 0; uuu < 50; uuu++)
    FSE_Generate(table, freqs, syms, accuracy,
        #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
          numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
        #endif
          0
        );
  }
  return SZ_OK;
}
687 
688 
689 // ---------- HUFFMAN ----------
690 
691 #define HUF_MAX_BITS    12
692 #define HUF_MAX_SYMBS   256
693 #define HUF_DUMMY_SIZE  (128 + 8 * 2)  // it must multiple of 8
694 // #define HUF_DUMMY_SIZE 0
695 #define HUF_TABLE_SIZE  ((2 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
696 #define HUF_GET_SYMBOLS(table)  ((table) + (1 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
697 // #define HUF_GET_LENS(table)  (table)
698 
// Huffman decode table storage, declared as UInt64[] to force alignment.
// Layout (see HUF_GET_SYMBOLS): (1 << HUF_MAX_BITS) code-length bytes,
// then HUF_DUMMY_SIZE padding, then the symbol bytes.
typedef struct
{
  // Byte table[HUF_TABLE_SIZE];
  UInt64 table64[HUF_TABLE_SIZE / sizeof(UInt64)];
}
CZstdDecHufTable;
705 
/*
Builds the flat Huffman decode table:
  table[state]                    -> number of bits for that state
  HUF_GET_SYMBOLS(table)[state]   -> decoded symbol for that state
Input:
  numSyms != 0
  (bits) array size must be aligned for 2
  if (numSyms & 1), then bits[numSyms] == 0,
  Huffman tree must be correct before Huf_Build() call:
    (sum (1/2^bits[i]) == 1).
    && (bits[i] <= HUF_MAX_BITS)
*/
static
Z7_FORCE_INLINE
void Huf_Build(Byte * const table,
    const Byte *bits, const unsigned numSyms)
{
  // Two interleaved counters reduce dependency chains in the counting loop.
  unsigned counts0[HUF_MAX_BITS + 1];
  unsigned counts1[HUF_MAX_BITS + 1];
  const Byte * const bitsEnd = bits + numSyms;
  // /*
  {
    unsigned t;
    for (t = 0; t < Z7_ARRAY_SIZE(counts0); t++) counts0[t] = 0;
    for (t = 0; t < Z7_ARRAY_SIZE(counts1); t++) counts1[t] = 0;
  }
  // */
  // memset(counts0, 0, sizeof(counts0));
  // memset(counts1, 0, sizeof(counts1));
  {
    // Phase 1: count how many symbols use each code length.
    const Byte *bits2 = bits;
    // we access additional bits[symbol] if (numSyms & 1)
    do
    {
      counts0[bits2[0]]++;
      counts1[bits2[1]]++;
    }
    while ((bits2 += 2) < bitsEnd);
  }
  {
    // Phase 2: walk lengths from longest to shortest, assigning each
    // length its contiguous region of states and filling the per-state
    // bit-length bytes.  counts0[i] becomes the first state offset for
    // length i (reused in phase 3).
    unsigned r = 0;
    unsigned i = HUF_MAX_BITS;
    // Byte *lens = HUF_GET_LENS(symbols);
    do
    {
      const unsigned num = (counts0[i] + counts1[i]) << (HUF_MAX_BITS - i);
      counts0[i] = r;
      if (num)
      {
        Byte *lens = &table[r];
        r += num;
        memset(lens, (int)i, num);
      }
    }
    while (--i);
    counts0[0] = 0; // for speculated loads
    // no need for check:
    // if (r != (UInt32)1 << HUF_MAX_BITS) exit(0);
  }
  {
    // Phase 3: fill the symbol bytes.  (v) holds the current symbol index
    // replicated into every byte, so wide stores can splat it.
    #ifdef MY_CPU_64BIT
      UInt64
    #else
      UInt32
    #endif
        v = 0;
    Byte *symbols = HUF_GET_SYMBOLS(table);
    do
    {
      const unsigned nb = *bits++;
      if (nb)
      {
        const unsigned code = counts0[nb];
        // each code of length nb covers (2^HUF_MAX_BITS / 2^nb) states
        const unsigned num = (1u << HUF_MAX_BITS) >> nb;
        counts0[nb] = code + num;
        // memset(&symbols[code], i, num);
        // /*
        {
          // size-specialized fill; ranges may overlap-write but all bytes
          // written equal (Byte)v
          Byte *s2 = &symbols[code];
          if (num <= 2)
          {
            s2[0] = (Byte)v;
            s2[(size_t)num - 1] = (Byte)v;
          }
          else if (num <= 8)
          {
            *(UInt32 *)(void *)s2 = (UInt32)v;
            *(UInt32 *)(void *)(s2 + (size_t)num - 4) = (UInt32)v;
          }
          else
          {
            // num is a multiple of 16 bytes here (nb <= HUF_MAX_BITS - 4)
            #ifdef MY_CPU_64BIT
              UInt64 *s = (UInt64 *)(void *)s2;
              const UInt64 *lim = (UInt64 *)(void *)(s2 + num);
              do
              {
                s[0] = v;  s[1] = v;  s += 2;
              }
              while (s != lim);
            #else
              UInt32 *s = (UInt32 *)(void *)s2;
              const UInt32 *lim = (const UInt32 *)(const void *)(s2 + num);
              do
              {
                s[0] = v;  s[1] = v;  s += 2;
                s[0] = v;  s[1] = v;  s += 2;
              }
              while (s != lim);
            #endif
          }
        }
        // */
      }
      // advance the replicated symbol value by 1 in every byte lane
      v +=
        #ifdef MY_CPU_64BIT
          0x0101010101010101;
        #else
          0x01010101;
        #endif
    }
    while (bits != bitsEnd);
  }
}
826 
827 
828 
829 // how many bytes (src) was moved back from original value.
830 // we need (HUF_SRC_OFFSET == 3) for optimized 32-bit memory access
831 #define HUF_SRC_OFFSET  3
832 
833 // v <<= 8 - (bitOffset & 7) + numBits;
834 // v >>= 32 - HUF_MAX_BITS;
835 #define HUF_GET_STATE(v, bitOffset, numBits) \
836   GET32(v, src + (HUF_SRC_OFFSET - 3) + ((CBitCtr_signed)bitOffset >> 3)) \
837   v >>= 32 - HUF_MAX_BITS - 8 + ((unsigned)bitOffset & 7) - numBits; \
838   v &= (1u << HUF_MAX_BITS) - 1; \
839 
840 
841 #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
842 #if defined(MY_CPU_AMD64) && defined(_MSC_VER) && _MSC_VER == 1400 \
843   || !defined(MY_CPU_X86_OR_AMD64) \
844   // || 1 == 1 /* for debug : to force STREAM4_PRELOAD mode */
845   // we need big number (>=16) of registers for PRELOAD4
846   #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
847   // #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2 // for debug
848 #endif
849 #endif
850 
851 // for debug: simpler and smaller code but slow:
852 // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
853 
854 #if  defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
855     !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS)
856 
857 #define HUF_DECODE(bitOffset, dest) \
858 { \
859   UInt32 v; \
860   HUF_GET_STATE(v, bitOffset, 0) \
861   bitOffset -= table[v]; \
862   *(dest) = symbols[v]; \
863   if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
864 }
865 
866 #endif
867 
868 #if !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
869      defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4) || \
870      defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) \
871 
872 #define HUF_DECODE_2_INIT(v, bitOffset) \
873   HUF_GET_STATE(v, bitOffset, 0)
874 
875 #define HUF_DECODE_2(v, bitOffset, dest) \
876 { \
877   unsigned numBits; \
878   numBits = table[v]; \
879   *(dest) = symbols[v]; \
880   HUF_GET_STATE(v, bitOffset, numBits) \
881   bitOffset -= (CBitCtr)numBits; \
882   if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
883 }
884 
885 #endif
886 
887 
888 // src == ptr - HUF_SRC_OFFSET
889 // we are allowed to access 3 bytes before start of input buffer
890 static
891 Z7_NO_INLINE
Huf_Decompress_1stream(const Byte * const table,const Byte * src,const size_t srcLen,Byte * dest,const size_t destLen)892 SRes Huf_Decompress_1stream(const Byte * const table,
893     const Byte *src, const size_t srcLen,
894     Byte *dest, const size_t destLen)
895 {
896   CBitCtr bitOffset;
897   if (srcLen == 0)
898     return SZ_ERROR_DATA;
899   SET_bitOffset_TO_PAD (bitOffset, src + HUF_SRC_OFFSET, srcLen)
900   if (destLen)
901   {
902     const Byte *symbols = HUF_GET_SYMBOLS(table);
903     const Byte *destLim = dest + destLen;
904     #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
905     {
906       do
907       {
908         HUF_DECODE (bitOffset, dest)
909       }
910       while (++dest != destLim);
911     }
912     #else
913     {
914       UInt32 v;
915       HUF_DECODE_2_INIT (v, bitOffset)
916       do
917       {
918         HUF_DECODE_2 (v, bitOffset, dest)
919       }
920       while (++dest != destLim);
921     }
922     #endif
923   }
924   return bitOffset == 0 ? SZ_OK : SZ_ERROR_DATA;
925 }
926 
927 
// for debug : it reduces register pressure : by array copy can be slow :
// #define Z7_ZSTD_DEC_USE_HUF_LOCAL

// src == ptr + (6 - HUF_SRC_OFFSET)
// srcLen >= 10
// we are allowed to access 3 bytes before start of input buffer
/*
Huf_Decompress_4stream() decodes literals that are stored as 4 interleaved
Huffman bit streams (zstd "4-streams" literal mode).
A 6-byte jump table at the start of the payload holds the compressed sizes
of streams 1..3; stream 4 occupies the remaining input bytes.
Each of streams 1..3 decodes to exactly (delta) == (destLen + 3) / 4 output
bytes; stream 4 decodes the remainder.
Returns SZ_ERROR_DATA if any stream size is 0 or if any stream is not
consumed exactly to its boundary.
*/
static
Z7_NO_INLINE
SRes Huf_Decompress_4stream(const Byte * const
  #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
    table2,
  #else
    table,
  #endif
    const Byte *src, size_t srcLen,
    Byte *dest, size_t destLen)
{
 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
  // local copy of the decode table (see note about register pressure above)
  Byte table[HUF_TABLE_SIZE];
 #endif
  UInt32 sizes[3];  // cumulative end offsets of streams 1..3 inside the payload
  const size_t delta = (destLen + 3) / 4;  // output size of each of streams 1..3
  if ((sizes[0] = GetUi16(src + (0 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
  if ((sizes[1] = GetUi16(src + (2 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
  sizes[1] += sizes[0];
  if ((sizes[2] = GetUi16(src + (4 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
  sizes[2] += sizes[1];
  srcLen -= 6;  // skip the jump table
  // stream 4 must contain at least 1 byte:
  if (srcLen <= sizes[2])
    return SZ_ERROR_DATA;

 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
  {
    // unsigned i = 0; for(; i < 1000; i++)
    memcpy(table, table2, HUF_TABLE_SIZE);
  }
 #endif

  #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
  {
    CBitCtr bitOffset_0,
            bitOffset_1,
            bitOffset_2,
            bitOffset_3;
    {
      // position each bit reader at the padded end of its stream
      // (Huffman bits are read backward in zstd):
      SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_0, src + HUF_SRC_OFFSET, sizes[0])
      SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_1, src + HUF_SRC_OFFSET, sizes[1])
      SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_2, src + HUF_SRC_OFFSET, sizes[2])
      SET_bitOffset_TO_PAD                  (bitOffset_3, src + HUF_SRC_OFFSET, srcLen)
    }
    {
      // (symbols) is used by the HUF_DECODE* macros below
      const Byte * const symbols = HUF_GET_SYMBOLS(table);
      // destLim : end of the region that all 4 streams fill in lockstep
      Byte *destLim = dest + destLen - delta * 3;

      if (dest != destLim)
    #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
      {
        // all 4 stream states are preloaded and advanced per iteration
        UInt32 v_0, v_1, v_2, v_3;
        HUF_DECODE_2_INIT (v_0, bitOffset_0)
        HUF_DECODE_2_INIT (v_1, bitOffset_1)
        HUF_DECODE_2_INIT (v_2, bitOffset_2)
        HUF_DECODE_2_INIT (v_3, bitOffset_3)
        // #define HUF_DELTA (1 << 17) / 4
        do
        {
          HUF_DECODE_2 (v_3, bitOffset_3, dest + delta * 3)
          HUF_DECODE_2 (v_2, bitOffset_2, dest + delta * 2)
          HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
          HUF_DECODE_2 (v_0, bitOffset_0, dest)
        }
        while (++dest != destLim);
        /*
        {// unsigned y = 0; for (;y < 1; y++)
        {
          const size_t num = destLen - delta * 3;
          Byte *orig = dest - num;
          memmove (orig + delta    , orig + HUF_DELTA,     num);
          memmove (orig + delta * 2, orig + HUF_DELTA * 2, num);
          memmove (orig + delta * 3, orig + HUF_DELTA * 3, num);
        }}
        */
      }
    #elif defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2)
      {
        // decode streams (0,1) and then (2,3) in two passes of pairs
        UInt32 v_0, v_1, v_2, v_3;
        HUF_DECODE_2_INIT (v_0, bitOffset_0)
        HUF_DECODE_2_INIT (v_1, bitOffset_1)
        do
        {
          HUF_DECODE_2 (v_0, bitOffset_0, dest)
          HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
        }
        while (++dest != destLim);
        // rewind (dest) to the start of the lockstep region, shifted to streams 2/3:
        dest = destLim - (destLen - delta * 3);
        dest += delta * 2;
        destLim += delta * 2;
        HUF_DECODE_2_INIT (v_2, bitOffset_2)
        HUF_DECODE_2_INIT (v_3, bitOffset_3)
        do
        {
          HUF_DECODE_2 (v_2, bitOffset_2, dest)
          HUF_DECODE_2 (v_3, bitOffset_3, dest + delta)
        }
        while (++dest != destLim);
        dest -= delta * 2;
        destLim -= delta * 2;
      }
    #else
      {
        // simple variant: one symbol per stream per iteration, no preload
        do
        {
          HUF_DECODE (bitOffset_3, dest + delta * 3)
          HUF_DECODE (bitOffset_2, dest + delta * 2)
          HUF_DECODE (bitOffset_1, dest + delta)
          HUF_DECODE (bitOffset_0, dest)
        }
        while (++dest != destLim);
      }
    #endif

      // stream 4 must be consumed exactly down to the end of stream 3:
      if (bitOffset_3 != (CBitCtr)sizes[2])
        return SZ_ERROR_DATA;
      // when destLen is not a multiple of 4, streams 1..3 each still hold
      // one extra byte per remaining position; finish the last partial group
      // ((4 - (destLen & 3)) iterations over streams 0..2):
      if (destLen &= 3)
      {
        destLim = dest + 4 - destLen;
        do
        {
          HUF_DECODE (bitOffset_2, dest + delta * 2)
          HUF_DECODE (bitOffset_1, dest + delta)
          HUF_DECODE (bitOffset_0, dest)
        }
        while (++dest != destLim);
      }
      // streams 1..3 must be consumed exactly to their boundaries:
      if (   bitOffset_0 != 0
          || bitOffset_1 != (CBitCtr)sizes[0]
          || bitOffset_2 != (CBitCtr)sizes[1])
        return SZ_ERROR_DATA;
    }
  }
  #else // Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
  {
    // debug fallback: decode the 4 streams sequentially with the 1-stream decoder
    unsigned i;
    for (i = 0; i < 4; i++)
    {
      size_t d = destLen;   // for the last stream: all remaining output
      size_t size = srcLen; // for the last stream: cumulative end == full payload
      if (i != 3)
      {
        d = delta;
        size = sizes[i];
      }
      if (i != 0)
        size -= sizes[i - 1];  // convert cumulative end offset to stream size
      destLen -= d;
      RINOK(Huf_Decompress_1stream(table, src, size, dest, d))
      dest += d;
      src += size;
    }
  }
  #endif

  return SZ_OK;
}
1091 
1092 
1093 
// (in->len != 0)
// we are allowed to access in->ptr[-3]
// at least 2 bytes in (in->ptr) will be processed
/*
Huf_DecodeTable() parses the Huffman table description that precedes the
compressed literals in a zstd block and builds the decode table in (p).
Two encodings exist, selected by the first header byte:
  header >= 128 : direct encoding, (header - 127) weights, 4 bits each;
  header <  128 : FSE-compressed weights, (header) bytes of FSE stream.
The weight of the last symbol is not stored: it is reconstructed so that
the weight sum becomes a power of 2 (RFC 8878, Huffman tree description).
Returns SZ_ERROR_DATA on any malformed description.
*/
static SRes Huf_DecodeTable(CZstdDecHufTable *const p, CInBufPair *const in)
{
  Byte weights[HUF_MAX_SYMBS + 1];  // +1 for extra write for loop unroll
  unsigned numSyms;
  const unsigned header = *(in->ptr)++;
  in->len--;
  // memset(weights, 0, sizeof(weights));
  if (header >= 128)
  {
    // direct representation: 4 bits field (0-15) per weight
    numSyms = header - 127;
    // numSyms != 0
    {
      const size_t numBytes = (numSyms + 1) / 2;  // two 4-bit weights per byte
      const Byte *const ws = in->ptr;
      size_t i = 0;
      if (in->len < numBytes)
        return SZ_ERROR_DATA;
      in->ptr += numBytes;
      in->len -= numBytes;
      do
      {
        // unpack high nibble then low nibble; for odd numSyms this writes
        // one extra (garbage) weight at weights[numSyms], overwritten later
        const unsigned b = ws[i];
        weights[i * 2    ] = (Byte)(b >> 4);
        weights[i * 2 + 1] = (Byte)(b & 0xf);
      }
      while (++i != numBytes);
      /* 7ZIP: we can restore correct zero value for weights[numSyms],
         if we want to use zero values starting from numSyms in code below. */
      // weights[numSyms] = 0;
    }
  }
  else
  {
    // FSE-compressed weights: accuracy is limited to 6 bits by the format
    #define MAX_ACCURACY_LOG_FOR_WEIGHTS 6
    CFseRecord table[1 << MAX_ACCURACY_LOG_FOR_WEIGHTS];

    Byte accuracy;
    const Byte *src;
    size_t srcLen;
    if (in->len < header)
      return SZ_ERROR_DATA;
    {
      // carve out a (header)-byte sub-stream holding the FSE table + bits
      CInBufPair fse_stream;
      fse_stream.len = header;
      fse_stream.ptr = in->ptr;
      in->ptr += header;
      in->len -= header;
      RINOK(FSE_DecodeHeader(table, &fse_stream,
          MAX_ACCURACY_LOG_FOR_WEIGHTS,
          &accuracy,
          16 // num weight symbols max (max-symbol is 15)
          ))
      // at least 2 bytes were processed in fse_stream.
      // (srcLen > 0) after FSE_DecodeHeader()
      // if (srcLen == 0) return SZ_ERROR_DATA;
      src = fse_stream.ptr;
      srcLen = fse_stream.len;
    }
    // we are allowed to access src[-5]
    {
      // two interleaved FSE states decode the weight stream backward;
      // decoding ends when the bit offset counter goes negative exactly
      // at the stream start (anything else is a data error)
      // unsigned yyy = 200; do {
      CBitCtr bitOffset;
      FastInt32 state1, state2;
      SET_bitOffset_TO_PAD (bitOffset, src, srcLen)
      state1 = accuracy;
      src -= state1 >> 2;  // src -= 1; // for GET16() optimization
      state1 <<= FSE_REC_LEN_OFFSET;
      state2 = state1;
      numSyms = 0;
      for (;;)
      {
        #define FSE_WEIGHT_DECODE(st) \
        { \
          const unsigned bits = GET_FSE_REC_LEN(st); \
          FastInt r; \
          GET16(r, src + (bitOffset >> 3)) \
          r >>= (unsigned)bitOffset & 7; \
          if ((CBitCtr_signed)(bitOffset -= (CBitCtr)bits) < 0) \
            { if (bitOffset + (CBitCtr)bits != 0) \
                return SZ_ERROR_DATA; \
              break; } \
          r &= 0xff; \
          r >>= 8 - bits; \
          st = table[GET_FSE_REC_STATE(st) + r]; \
          weights[numSyms++] = (Byte)GET_FSE_REC_SYM(st); \
        }
        FSE_WEIGHT_DECODE (state1)
        FSE_WEIGHT_DECODE (state2)
        if (numSyms == HUF_MAX_SYMBS)
          return SZ_ERROR_DATA;
      }
      // src += (unsigned)accuracy >> 2; } while (--yyy);
    }
  }

  // Build using weights:
  {
    UInt32 sum = 0;
    {
      // numSyms >= 1
      unsigned i = 0;
      weights[numSyms] = 0;  // sentinel: lets the 2-way unrolled loop over-read safely
      do
      {
        // each weight w contributes (1 << w) except w == 0, which adds nothing
        // because ((UInt32)1 << 0) & ~(UInt32)1 == 0
        sum += ((UInt32)1 << weights[i    ]) & ~(UInt32)1;
        sum += ((UInt32)1 << weights[i + 1]) & ~(UInt32)1;
        i += 2;
      }
      while (i < numSyms);
      if (sum == 0)
        return SZ_ERROR_DATA;
    }
    {
      const unsigned maxBits = GetHighestSetBit_32_nonzero_big(sum) + 1;
      {
        // the implicit last weight must make the sum a power of 2:
        const UInt32 left = ((UInt32)1 << maxBits) - sum;
        // (left != 0)
        // (left) must be power of 2 in correct stream
        if (left & (left - 1))
          return SZ_ERROR_DATA;
        weights[numSyms++] = (Byte)GetHighestSetBit_32_nonzero_big(left);
      }
      // if (numSyms & 1)
        weights[numSyms] = 0; // for loop unroll
      // numSyms >= 2
      {
        // convert each nonzero weight to a code length: len = maxBits - weight
        unsigned i = 0;
        do
        {
          /*
          #define WEIGHT_ITER(a) \
            { unsigned w = weights[i + (a)]; \
              const unsigned t = maxBits - w; \
              w = w ? t: w; \
              if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
              weights[i + (a)] = (Byte)w; }
          */
          // /*
          #define WEIGHT_ITER(a) \
            { unsigned w = weights[i + (a)]; \
              if (w) {  \
                w = maxBits - w; \
                if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
                weights[i + (a)] = (Byte)w; }}
          // */
          WEIGHT_ITER(0)
          // WEIGHT_ITER(1)
          // i += 2;
        }
        while (++i != numSyms);
      }
    }
  }
  {
    // unsigned yyy; for (yyy = 0; yyy < 100; yyy++)
    Huf_Build((Byte *)(void *)p->table64, weights, numSyms);
  }
  return SZ_OK;
}
1257 
1258 
/* Symbol_Compression_Mode for a sequence table (zstd block header, 2 bits).
   The numeric values match the on-wire encoding and must not change. */
typedef enum
{
  k_SeqMode_Predef = 0,  // use the predefined FSE distribution
  k_SeqMode_RLE    = 1,  // single symbol repeated (1 byte follows)
  k_SeqMode_FSE    = 2,  // FSE table description follows
  k_SeqMode_Repeat = 3   // reuse the table from the previous block
}
z7_zstd_enum_SeqMode;
1267 
// predefAccuracy == 5 for OFFSET symbols
// predefAccuracy == 6 for MATCH/LIT LEN symbols
/*
FSE_Decode_SeqTable() prepares one sequence decoding table (LL/OF/ML)
according to (seqMode):
  k_SeqMode_FSE    : parse an FSE table description from (in);
  k_SeqMode_Predef : copy the predefined table (predefs);
  k_SeqMode_RLE    : read 1 symbol byte and store it as table[0];
  (k_SeqMode_Repeat is handled by the caller: tables are kept as-is).
(accuracyRes) receives the accuracy log of the resulting table
(0 for the RLE case).
Returns SZ_ERROR_DATA on truncated or invalid input.
*/
static
SRes
Z7_NO_INLINE
// Z7_FORCE_INLINE
FSE_Decode_SeqTable(CFseRecord * const table,
    CInBufPair * const in,
    unsigned predefAccuracy,
    Byte * const accuracyRes,
    unsigned numSymbolsMax,
    const CFseRecord * const predefs,
    const unsigned seqMode)
{
  // UNUSED_VAR(numSymsPredef)
  // UNUSED_VAR(predefFreqs)
  if (seqMode == k_SeqMode_FSE)
  {
    // unsigned y = 50; CInBufPair in2 = *in; do { *in = in2; RINOK(
    return
    FSE_DecodeHeader(table, in,
        predefAccuracy + 3, // accuracyMax
        accuracyRes,
        numSymbolsMax)
    ;
    // )} while (--y); return SZ_OK;
  }
  // numSymsMax = numSymsPredef + ((predefAccuracy & 1) * (32 - 29))); // numSymsMax
  // numSymsMax == 32 for offsets

  if (seqMode == k_SeqMode_Predef)
  {
    *accuracyRes = (Byte)predefAccuracy;
    memcpy(table, predefs, sizeof(UInt32) << predefAccuracy);
    return SZ_OK;
  }

  // (seqMode == k_SeqMode_RLE)
  if (in->len == 0)
    return SZ_ERROR_DATA;
  in->len--;
  {
    const Byte *ptr = in->ptr;
    const Byte sym = ptr[0];
    in->ptr = ptr + 1;
    // store the single symbol; with Z7_ZSTD_DEC_USE_ML_PLUS3 the match-length
    // table keeps (symbol + MATCH_LEN_MIN) pre-added
    table[0] = (FastInt32)sym
      #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
        + (numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : 0)
      #endif
      ;
    *accuracyRes = 0;  // accuracy 0 marks an RLE table
  }
  return SZ_OK;
}
1322 
1323 
/* FSE decode tables for the three sequence fields.
   Table sizes are (1 << max_accuracy): offsets use accuracy <= 8,
   literal lengths and match lengths use accuracy <= 9. */
typedef struct
{
  CFseRecord of[1 << 8];  // offset codes
  CFseRecord ll[1 << 9];  // literal lengths
  CFseRecord ml[1 << 9];  // match lengths
}
CZstdDecFseTables;
1331 
1332 
/* Per-frame decoder state for one zstd stream. */
typedef struct
{
  Byte *win;       // output window (cyclic buffer or caller's buffer)
  SizeT cycSize;   // allocated / usable size of (win), see note below
  /*
    if (outBuf_fromCaller)  : cycSize = outBufSize_fromCaller
    else {
      if ( isCyclicMode) : cycSize = cyclic_buffer_size = (winSize + extra_space)
      if (!isCyclicMode) : cycSize = ContentSize,
      (isCyclicMode == true) if (ContetSize >= winSize) or ContetSize is unknown
    }
  */
  SizeT winPos;    // current write position in (win)

  CZstdDecOffset reps[3];  // recent match offsets (zstd repeat codes)

  // accuracy logs of the current sequence FSE tables:
  Byte ll_accuracy;
  Byte of_accuracy;
  Byte ml_accuracy;  // also doubles as the "tables were set" flag, see IS_SEQ_TABLES_WERE_SET
  // Byte seqTables_wereSet;
  Byte litHuf_wasSet;  // nonzero after a literal Huffman table has been decoded

  Byte *literalsBase;  // buffer for decoded literals of the current block

  size_t winSize;        // from header
  size_t totalOutCheck;  // totalOutCheck <= winSize

  #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
  SEQ_EXTRA_TABLES(m_)
  #endif
  // UInt64 _pad_Alignment;  // is not required now
  CZstdDecFseTables fse;
  CZstdDecHufTable huf;
}
CZstdDec1;
1368 
// maximum decompressed block size: min(winSize, kBlockSizeMax)
#define ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) \
  ((p)->winSize < kBlockSizeMax ? (UInt32)(p)->winSize : kBlockSizeMax)

// ml_accuracy == 1 serves as the "sequence tables not set yet" marker:
#define SEQ_TABLES_WERE_NOT_SET_ml_accuracy  1  // accuracy=1 is not used by zstd
#define IS_SEQ_TABLES_WERE_SET(p)  (((p)->ml_accuracy != SEQ_TABLES_WERE_NOT_SET_ml_accuracy))
// #define IS_SEQ_TABLES_WERE_SET(p)  ((p)->seqTables_wereSet)
1375 
1376 
/* One-time construction: clears the buffer pointers so that later
   allocation / free logic is safe. Per-frame state is set separately
   by ZstdDec1_Init(). */
static void ZstdDec1_Construct(CZstdDec1 *p)
{
  #ifdef Z7_ZSTD_DEC_PRINT_TABLE
  // debug only: dump the predefined FSE distributions
  Print_Predef(6, NUM_LL_SYMBOLS, SEQ_LL_PREDEF_DIST, k_PredefRecords_LL);
  Print_Predef(5, NUM_OFFSET_SYMBOLS_PREDEF, SEQ_OFFSET_PREDEF_DIST, k_PredefRecords_OF);
  Print_Predef(6, NUM_ML_SYMBOLS, SEQ_ML_PREDEF_DIST, k_PredefRecords_ML);
  #endif

  p->win = NULL;
  p->cycSize = 0;
  p->literalsBase = NULL;
  #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
  FILL_LOC_BASES_ALL
  #endif
}
1392 
1393 
/* Reset per-frame decoder state: the running output-size counter, the
   "literal Huffman table was decoded" flag, the "sequence tables not set"
   marker, and zstd's initial repeat offsets (1, 4, 8). */
static void ZstdDec1_Init(CZstdDec1 *p)
{
  p->totalOutCheck = 0;
  p->litHuf_wasSet = False;
  // p->seqTables_wereSet = False;
  // accuracy 1 never occurs in a real stream, so it marks "tables not set":
  p->ml_accuracy = SEQ_TABLES_WERE_NOT_SET_ml_accuracy;
  p->reps[2] = 8;
  p->reps[1] = 4;
  p->reps[0] = 1;
}
1404 
1405 
1406 
// fast copy helpers: COPY_CHUNK copies COPY_CHUNK_SIZE bytes (possibly in
// two halves with an early break), COPY_OFFSET_MIN is the smallest match
// offset for which chunked copy is safe without self-overlap
#ifdef MY_CPU_LE_UNALIGN
  #define Z7_ZSTD_DEC_USE_UNALIGNED_COPY
#endif

#ifdef Z7_ZSTD_DEC_USE_UNALIGNED_COPY

  #define COPY_CHUNK_SIZE 16

    // copy 8 bytes as two 32-bit words and advance both pointers
    #define COPY_CHUNK_4_2(dest, src) \
    { \
      ((UInt32 *)(void *)dest)[0] = ((const UInt32 *)(const void *)src)[0]; \
      ((UInt32 *)(void *)dest)[1] = ((const UInt32 *)(const void *)src)[1]; \
      src += 4 * 2; \
      dest += 4 * 2; \
    }

  /* sse2 doesn't help here in GCC and CLANG.
     so we disabled sse2 here */
  /*
  #if defined(MY_CPU_AMD64)
    #define Z7_ZSTD_DEC_USE_SSE2
  #elif defined(MY_CPU_X86)
    #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
      || defined(__SSE2__) \
      // || 1 == 1  // for debug only
      #define Z7_ZSTD_DEC_USE_SSE2
    #endif
  #endif
  */

  #if defined(MY_CPU_ARM64)
    #define COPY_OFFSET_MIN  16
    // 16-byte NEON copy
    #define COPY_CHUNK1(dest, src) \
    { \
      vst1q_u8((uint8_t *)(void *)dest, \
      vld1q_u8((const uint8_t *)(const void *)src)); \
      src += 16; \
      dest += 16; \
    }

    // note: COPY_CHUNK is used inside a loop and may `break` out of it
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK1(dest, src) \
      if ((len -= COPY_CHUNK_SIZE) == 0) break; \
      COPY_CHUNK1(dest, src) \
    }

  #elif defined(Z7_ZSTD_DEC_USE_SSE2)
    #include <emmintrin.h> // sse2
    #define COPY_OFFSET_MIN  16

    // 16-byte SSE2 copy
    #define COPY_CHUNK1(dest, src) \
    { \
      _mm_storeu_si128((__m128i *)(void *)dest, \
      _mm_loadu_si128((const __m128i *)(const void *)src)); \
      src += 16; \
      dest += 16; \
    }

    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK1(dest, src) \
      if ((len -= COPY_CHUNK_SIZE) == 0) break; \
      COPY_CHUNK1(dest, src) \
    }

  #elif defined(MY_CPU_64BIT)
    #define COPY_OFFSET_MIN  8

    // 16-byte copy as two 64-bit words
    #define COPY_CHUNK(dest, src) \
    { \
      ((UInt64 *)(void *)dest)[0] = ((const UInt64 *)(const void *)src)[0]; \
      ((UInt64 *)(void *)dest)[1] = ((const UInt64 *)(const void *)src)[1]; \
      src += 8 * 2; \
      dest += 8 * 2; \
    }

  #else
    #define COPY_OFFSET_MIN  4

    // 16-byte copy as four 32-bit words
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK_4_2(dest, src); \
      COPY_CHUNK_4_2(dest, src); \
    }

  #endif
#endif


// portable fallback (no unaligned access): byte-pair copies, 8-byte chunks
#ifndef COPY_CHUNK_SIZE
    #define COPY_OFFSET_MIN  4
    #define COPY_CHUNK_SIZE  8
    #define COPY_CHUNK_2(dest, src) \
    { \
      const Byte a0 = src[0]; \
      const Byte a1 = src[1]; \
      dest[0] = a0; \
      dest[1] = a1; \
      src += 2; \
      dest += 2; \
    }
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK_2(dest, src) \
      COPY_CHUNK_2(dest, src) \
      COPY_CHUNK_2(dest, src) \
      COPY_CHUNK_2(dest, src) \
    }
#endif


// round (len) up to a chunk multiple; if that would exceed (rem),
// copy the unaligned tail of (rem) byte-by-byte first and clip (len).
// May `return` from the enclosing function when everything was copied.
#define COPY_PREPARE \
  len += (COPY_CHUNK_SIZE - 1); \
  len &= ~(size_t)(COPY_CHUNK_SIZE - 1); \
  { if (len > rem) \
  { len = rem; \
    rem &= (COPY_CHUNK_SIZE - 1); \
    if (rem) {  \
      len -= rem; \
      Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
      do *dest++ = *src++; while (--rem); \
      if (len == 0) return; }}}

// copy (len) bytes, (len) is a nonzero multiple of COPY_CHUNK_SIZE here
#define COPY_CHUNKS \
{ \
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
  do { COPY_CHUNK(dest, src) } \
  while (len -= COPY_CHUNK_SIZE); \
}
1537 
// (len != 0)
// (len <= rem)
/*
CopyLiterals() copies (len) bytes from (src) to (dest) in chunk-sized
steps. (len) is rounded up to a chunk multiple, so up to
(COPY_CHUNK_SIZE - 1) extra bytes may be copied, but never more than
(rem) bytes total (the remaining room in the output window).
*/
static
Z7_FORCE_INLINE
// Z7_ATTRIB_NO_VECTOR
void CopyLiterals(Byte *dest, Byte const *src, size_t len, size_t rem)
{
  COPY_PREPARE
  COPY_CHUNKS
}
1548 
1549 
/* we can define Z7_STD_DEC_USE_AFTER_CYC_BUF, if we want to use additional
   space after cycSize that can be used to reduce the code in CopyMatch(): */
// for debug:
// #define Z7_STD_DEC_USE_AFTER_CYC_BUF

/*
CopyMatch()
if wrap (offset > winPos)
{
  then we have at least (COPY_CHUNK_SIZE) avail in (dest) before we will overwrite (src):
  (cycSize >= offset + COPY_CHUNK_SIZE)
  if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
    we are allowed to read win[cycSize + COPY_CHUNK_SIZE - 1],
}
(len != 0)

Copies an LZ match of (len) bytes at distance (offset) into the cyclic
window at (winPos). Handles:
  - wrap-around, when the match source starts before the cyclic wrap point;
  - overlapping copies (offset < chunk size) via byte patterns for
    offset 1, 2, 3, and a special 8-byte path for 4 <= offset < COPY_OFFSET_MIN.
(rem) limits the chunked over-copy, as in CopyLiterals().
*/
static
Z7_FORCE_INLINE
// Z7_ATTRIB_NO_VECTOR
void CopyMatch(size_t offset, size_t len,
    Byte *win, size_t winPos, size_t rem, const size_t cycSize)
{
  Byte *dest = win + winPos;
  const Byte *src;
  // STAT_INC(g_NumCopy)

  if (offset > winPos)
  {
    // match source wraps: it starts (back) bytes before the end of the
    // cyclic buffer
    size_t back = offset - winPos;
    // src = win + cycSize - back;
    // cycSize -= offset;
    STAT_INC(g_NumOver)
    src = dest + (cycSize - offset);
    // (src >= dest) here
   #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
    if (back < len)
    {
   #else
    if (back < len + (COPY_CHUNK_SIZE - 1))
    {
      if (back >= len)
      {
        // whole match fits before the wrap point, but too close to the
        // buffer end for chunked over-copy: copy byte-by-byte
        Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
        do
          *dest++ = *src++;
        while (--len);
        return;
      }
   #endif
      // back < len
      // copy the pre-wrap part byte-by-byte, then fall through to the
      // main copy for the rest (which now starts at the buffer start)
      STAT_INC(g_NumOver2)
      len -= back;
      rem -= back;
      Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
      do
        *dest++ = *src++;
      while (--back);
      src = dest - offset;
      // src = win;
      // we go to MAIN-COPY
    }
  }
  else
    src = dest - offset;

  // len != 0
  // do *dest++ = *src++; while (--len); return;

  // --- MAIN COPY ---
  // if (src >= dest), then ((size_t)(src - dest) >= COPY_CHUNK_SIZE)
  //   so we have at least COPY_CHUNK_SIZE space before overlap for writing.
  COPY_PREPARE

  /* now (len == COPY_CHUNK_SIZE * x)
     so we can unroll for aligned copy */
  {
    // const unsigned b0 = src[0];
    // (COPY_OFFSET_MIN >= 4)

    if (offset >= COPY_OFFSET_MIN)
    {
      // no self-overlap within a chunk: plain chunked copy
      COPY_CHUNKS
      // return;
    }
    else
  #if (COPY_OFFSET_MIN > 4)
    #if COPY_CHUNK_SIZE < 8
      #error Stop_Compiling_Bad_COPY_CHUNK_SIZE
    #endif
    if (offset >= 4)
    {
      // 4 <= offset < COPY_OFFSET_MIN: 8-byte word copies are still safe
      Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
      do
      {
        COPY_CHUNK_4_2(dest, src)
        #if COPY_CHUNK_SIZE != 16
          if (len == 8) break;
        #endif
        COPY_CHUNK_4_2(dest, src)
      }
      while (len -= 16);
      // return;
    }
    else
  #endif
    {
      // (offset < 4) : expand a 1-3 byte repeating pattern
      const unsigned b0 = src[0];
      if (offset < 2)
      {
        // offset == 1 : fill with a single byte value
      #if defined(Z7_ZSTD_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16)
        #if defined(MY_CPU_64BIT)
        {
          const UInt64 v64 = (UInt64)b0 * 0x0101010101010101;
          Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
          do
          {
            ((UInt64 *)(void *)dest)[0] = v64;
            ((UInt64 *)(void *)dest)[1] = v64;
            dest += 16;
          }
          while (len -= 16);
        }
        #else
        {
          UInt32 v = b0;
          v |= v << 8;
          v |= v << 16;
          do
          {
            ((UInt32 *)(void *)dest)[0] = v;
            ((UInt32 *)(void *)dest)[1] = v;
            dest += 8;
            ((UInt32 *)(void *)dest)[0] = v;
            ((UInt32 *)(void *)dest)[1] = v;
            dest += 8;
          }
          while (len -= 16);
        }
        #endif
      #else
        do
        {
          dest[0] = (Byte)b0;
          dest[1] = (Byte)b0;
          dest += 2;
          dest[0] = (Byte)b0;
          dest[1] = (Byte)b0;
          dest += 2;
        }
        while (len -= 4);
      #endif
      }
      else if (offset == 2)
      {
        // offset == 2 : alternate two bytes
        const Byte b1 = src[1];
        {
          do
          {
            dest[0] = (Byte)b0;
            dest[1] = b1;
            dest += 2;
          }
          while (len -= 2);
        }
      }
      else // (offset == 3)
      {
        // offset == 3 : repeat a 3-byte pattern; (len) is a multiple of
        // 4 or 8, so a 1-2 byte tail may remain after the 3-byte loop
        const Byte *lim = dest + len - 2;
        const Byte b1 = src[1];
        const Byte b2 = src[2];
        do
        {
          dest[0] = (Byte)b0;
          dest[1] = b1;
          dest[2] = b2;
          dest += 3;
        }
        while (dest < lim);
        lim++; // points to last byte that must be written
        if (dest <= lim)
        {
          *dest = (Byte)b0;
          if (dest != lim)
            dest[1] = b1;
        }
      }
    }
  }
}
1740 
1741 
1742 
// accumulate output size, saturating at winSize (used to validate
// matches that reference data before the start of the stream)
#define UPDATE_TOTAL_OUT(p, size) \
{ \
  size_t _toc = (p)->totalOutCheck + (size); \
  const size_t _ws = (p)->winSize; \
  if (_toc >= _ws) _toc = _ws; \
  (p)->totalOutCheck = _toc; \
}


#if defined(MY_CPU_64BIT) && defined(MY_CPU_LE_UNALIGN)
// we can disable it for debug:
#define Z7_ZSTD_DEC_USE_64BIT_LOADS
#endif
// #define Z7_ZSTD_DEC_USE_64BIT_LOADS // for debug : slow in 32-bit

// SEQ_SRC_OFFSET: how many bytes (src) (seqSrc) was moved back from original value.
// we need (SEQ_SRC_OFFSET != 0) for optimized memory access
#ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
  #define SEQ_SRC_OFFSET 7
#else
  #define SEQ_SRC_OFFSET 3
#endif
// pointer adjustment for a 4-byte read at the current (backward) bit position:
#define SRC_PLUS_FOR_4BYTES(bitOffset)  (SEQ_SRC_OFFSET - 3) + ((CBitCtr_signed)(bitOffset) >> 3)
#define BIT_OFFSET_7BITS(bitOffset)  ((unsigned)(bitOffset) & 7)
/*
  if (BIT_OFFSET_DELTA_BITS == 0) : bitOffset == number_of_unprocessed_bits
  if (BIT_OFFSET_DELTA_BITS == 1) : bitOffset == number_of_unprocessed_bits - 1
      and we can read 1 bit more in that mode : (8 * n + 1).
*/
// #define BIT_OFFSET_DELTA_BITS 0
#define BIT_OFFSET_DELTA_BITS 1
#if BIT_OFFSET_DELTA_BITS == 1
  // (7 ^ boff7) == (8 - 1 - boff7) for boff7 in [0, 7]
  #define GET_SHIFT_FROM_BOFFS7(boff7)  (7 ^ (boff7))
#else
  #define GET_SHIFT_FROM_BOFFS7(boff7)  (8 - BIT_OFFSET_DELTA_BITS - (boff7))
#endif

#define UPDATE_BIT_OFFSET(bitOffset, numBits) \
    (bitOffset) -= (CBitCtr)(numBits);

#define GET_SHIFT(bitOffset)  GET_SHIFT_FROM_BOFFS7(BIT_OFFSET_7BITS(bitOffset))


#if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS)
  #if (NUM_OFFSET_SYMBOLS_MAX - BIT_OFFSET_DELTA_BITS < 32)
    /* if (NUM_OFFSET_SYMBOLS_MAX == 32 && BIT_OFFSET_DELTA_BITS == 1),
       we have depth 31 + 9 + 9 + 8 = 57 bits that can b read with single read. */
    #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
  #endif
  #ifndef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
    #if (BIT_OFFSET_DELTA_BITS == 1)
    /* if (winLimit - winPos <= (kBlockSizeMax = (1 << 17)))
       {
         the case (16 bits literal extra + 16 match extra) is not possible
         in correct stream. So error will be detected for (16 + 16) case.
         And longest correct sequence after offset reading is (31 + 9 + 9 + 8 = 57 bits).
         So we can use just one 64-bit load here in that case.
       }
    */
    #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
    #endif
  #endif
#endif


#if !defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) || \
    (!defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
     !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML))
// in : (0 < bits <= (24 or 25)):
#define STREAM_READ_BITS(dest, bits) \
{ \
  GET32(dest, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  dest <<= GET_SHIFT(bitOffset); \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  dest >>= 32 - bits; \
}
#endif
1820 
1821 
// FSE state machinery for sequence decoding (ll/of/ml states are named
// via STATE_VAR; bits are consumed backward through (bitOffset))
#define FSE_Peek_1(table, state)  table[state]

#define STATE_VAR(name)  state_ ## name

// in : (0 <= accuracy <= (24 or 25))
#define FSE_INIT_STATE(name, cond) \
{ \
  UInt32 r; \
  const unsigned bits = p->name ## _accuracy; \
  GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  r <<= GET_SHIFT(bitOffset); \
  r >>= 1; \
  r >>= 31 ^ bits; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  cond \
  STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), r); \
  /* STATE_VAR(name) = dest << 16; */ \
}


#define FSE_Peek_Plus(name, r)  \
  STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), \
    GET_FSE_REC_STATE(STATE_VAR(name)) + r);

#define LZ_LOOP_ERROR_EXIT  { return SZ_ERROR_DATA; }

// bit counter going negative means the stream was over-read => data error
#define BO_OVERFLOW_CHECK \
  { if ((CBitCtr_signed)bitOffset < 0) LZ_LOOP_ERROR_EXIT }


#ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS

#define GET64(dest, p)  { const Byte *ptr = p;  dest = GetUi64(ptr); }

// preload up to 57 usable bits into (v), aligned to the current bit position
#define FSE_PRELOAD \
{ \
  GET64(v, src - 4 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  v <<= GET_SHIFT(bitOffset); \
}

// advance one FSE state using bits taken from the preloaded (v)
#define FSE_UPDATE_STATE_2(name, cond) \
{ \
  const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  UInt64 r = v; \
  v <<= bits; \
  r >>= 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  cond \
  r >>= 63 ^ bits; \
  FSE_Peek_Plus(name, r); \
}

#define FSE_UPDATE_STATES \
  FSE_UPDATE_STATE_2 (ll, {} ) \
  FSE_UPDATE_STATE_2 (ml, {} ) \
  FSE_UPDATE_STATE_2 (of, BO_OVERFLOW_CHECK) \

#else // Z7_ZSTD_DEC_USE_64BIT_LOADS

// it supports 8 bits accuracy for any code
// it supports 9 bits accuracy, if (BIT_OFFSET_DELTA_BITS == 1)
#define FSE_UPDATE_STATE_0(name, cond) \
{ \
  UInt32 r; \
  const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  GET16(r, src + 2 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  r >>= (bitOffset & 7); \
  r &= (1 << (8 + BIT_OFFSET_DELTA_BITS)) - 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  cond \
  r >>= (8 + BIT_OFFSET_DELTA_BITS) - bits; \
  FSE_Peek_Plus(name, r); \
}

// for debug (slow):
// #define Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE
#if BIT_OFFSET_DELTA_BITS == 0 || defined(Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE)
  #define Z7_ZSTD_DEC_USE_FSE_FUSION
#endif

#ifdef Z7_ZSTD_DEC_USE_FSE_FUSION
// NOTE: FSE_UPDATE_STATE_1 opens a scope ({ UInt32 rest2; {) that
// FSE_UPDATE_STATE_3 closes (}}); they must always be used as a pair,
// sharing one 32-bit load between two state updates.
#define FSE_UPDATE_STATE_1(name) \
{ UInt32 rest2; \
{ \
  UInt32 r; \
  unsigned bits; \
  GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  r <<= GET_SHIFT(bitOffset); \
  rest2 = r << bits; \
  r >>= 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  r >>= 31 ^ bits; \
  FSE_Peek_Plus(name, r); \
}

#define FSE_UPDATE_STATE_3(name) \
{ \
  const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  rest2 >>= 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  rest2 >>= 31 ^ bits; \
  FSE_Peek_Plus(name, rest2); \
}}

#define FSE_UPDATE_STATES \
  FSE_UPDATE_STATE_1 (ll) \
  FSE_UPDATE_STATE_3 (ml) \
  FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \

#else // Z7_ZSTD_DEC_USE_FSE_FUSION

#define FSE_UPDATE_STATES \
  FSE_UPDATE_STATE_0 (ll, {} ) \
  FSE_UPDATE_STATE_0 (ml, {} ) \
  FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \

#endif // Z7_ZSTD_DEC_USE_FSE_FUSION
#endif // Z7_ZSTD_DEC_USE_64BIT_LOADS
1941 
1942 
1943 
/* Parameters of one block's sequences section, passed to the
   sequence-decoding loop. */
typedef struct
{
  UInt32 numSeqs;        // number of (literal, match) sequences in the block
  UInt32 literalsLen;    // total decoded literal bytes available
  const Byte *literals;  // decoded literals buffer
}
CZstdDec1_Vars;


// if (BIT_OFFSET_DELTA_BITS != 0), we need (BIT_OFFSET_DELTA_BYTES > 0)
#define BIT_OFFSET_DELTA_BYTES   BIT_OFFSET_DELTA_BITS

/* if (NUM_OFFSET_SYMBOLS_MAX == 32)
     max_seq_bit_length = (31) + 16 + 16 + 9 + 8 + 9 = 89 bits
   if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) we have longest backward
     lookahead offset, and we read UInt64 after literal_len reading.
   if (BIT_OFFSET_DELTA_BITS == 1 && NUM_OFFSET_SYMBOLS_MAX == 32)
     MAX_BACKWARD_DEPTH = 16 bytes
*/
#define MAX_BACKWARD_DEPTH  \
    ((NUM_OFFSET_SYMBOLS_MAX - 1 + 16 + 16 + 7) / 8 + 7 + BIT_OFFSET_DELTA_BYTES)
1965 
/* srcLen != 0
   src == real_data_ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES
   if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) then
     (winLimit - p->winPos <= (1 << 17)) is required
*/
/* Executes the sequences section of a compressed block:
   reads (vars->numSeqs) FSE-coded (literal_length, offset, match_length)
   triples backwards from the bit stream (src, srcLen), copying literals from
   vars->literals and LZ matches into the window (p->win) up to winLimit.
   Updates p->winPos, p->totalOutCheck and the repeat offsets p->reps[].
   Returns SZ_ERROR_DATA on malformed input or limit violation. */
static
Z7_NO_INLINE
// Z7_ATTRIB_NO_VECTOR
SRes Decompress_Sequences(CZstdDec1 * const p,
    const Byte *src, const size_t srcLen,
    const size_t winLimit,
    const CZstdDec1_Vars * const vars)
{
#ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
  SEQ_EXTRA_TABLES(a_)
#endif

  // for debug:
  // #define Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
#ifdef Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
  #define FSE_TABLE(n)  fse. n
  const CZstdDecFseTables fse = p->fse;
  /*
  CZstdDecFseTables fse;
  #define COPY_FSE_TABLE(n) \
    memcpy(fse. n, p->fse. n, (size_t)4 << p-> n ## _accuracy);
  COPY_FSE_TABLE(of)
  COPY_FSE_TABLE(ll)
  COPY_FSE_TABLE(ml)
  */
#else
  #define FSE_TABLE(n)  (p->fse.  n)
#endif

#ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
  FILL_LOC_BASES_ALL
#endif

  {
    // load decoder state into locals for the duration of the hot loop;
    // it is stored back to (p) only on successful exit below.
    unsigned numSeqs = vars->numSeqs;
    const Byte *literals = vars->literals;
    ptrdiff_t literalsLen = (ptrdiff_t)vars->literalsLen;
    Byte * const win = p->win;
    size_t winPos = p->winPos;
    const size_t cycSize = p->cycSize;
    size_t totalOutCheck = p->totalOutCheck;
    const size_t winSize = p->winSize;
    size_t reps_0 = p->reps[0];
    size_t reps_1 = p->reps[1];
    size_t reps_2 = p->reps[2];
    UInt32 STATE_VAR(ll), STATE_VAR(of), STATE_VAR(ml);
    CBitCtr bitOffset;

    SET_bitOffset_TO_PAD (bitOffset, src + SEQ_SRC_OFFSET, srcLen + BIT_OFFSET_DELTA_BYTES)

    bitOffset -= BIT_OFFSET_DELTA_BITS;

    FSE_INIT_STATE(ll, {} )
    FSE_INIT_STATE(of, {} )
    FSE_INIT_STATE(ml, BO_OVERFLOW_CHECK)

    for (;;)
    {
      size_t matchLen;
    #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
      UInt64 v;
    #endif

      #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
        FSE_PRELOAD
      #endif

      // ---- offset decoding (repeat-offset handling per zstd spec) ----
      // if (of_code == 0)
      if ((Byte)STATE_VAR(of) == 0)
      {
        if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
        {
          // of_code == 0 && litLen == 0 : use second repeat offset
          const size_t offset = reps_1;
          reps_1 = reps_0;
          reps_0 = offset;
          STAT_INC(g_Num_Rep1)
        }
        STAT_UPDATE(else g_Num_Rep0++;)
      }
      else
      {
        const unsigned of_code = (Byte)STATE_VAR(of);

      #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
        #if !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
          FSE_PRELOAD
        #endif
      #else
        UInt32 v;
        {
          const Byte *src4 = src + SRC_PLUS_FOR_4BYTES(bitOffset);
          const unsigned skip = GET_SHIFT(bitOffset);
          GET32(v, src4)
          v <<= skip;
          v |= (UInt32)src4[-1] >> (8 - skip);
        }
      #endif

        UPDATE_BIT_OFFSET(bitOffset, of_code)

        if (of_code == 1)
        {
          // read 1 bit
          #if defined(Z7_MSC_VER_ORIGINAL) || defined(MY_CPU_X86_OR_AMD64)
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              #define CHECK_HIGH_BIT_64(a)  ((Int64)(UInt64)(a) < 0)
            #else
              #define CHECK_HIGH_BIT_32(a)  ((Int32)(UInt32)(a) < 0)
            #endif
          #else
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              #define CHECK_HIGH_BIT_64(a)  ((UInt64)(a) & ((UInt64)1 << 63))
            #else
              #define CHECK_HIGH_BIT_32(a)  ((UInt32)(a) & ((UInt32)1 << 31))
            #endif
          #endif

          // the XOR below merges the two cases
          // (litLen == 0, bit == 0) and (litLen != 0, bit == 1)
          // into a single high-bit test:
          if
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              CHECK_HIGH_BIT_64 (((UInt64)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
            #else
              CHECK_HIGH_BIT_32 (((UInt32)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
            #endif
          {
            v <<= 1;
            {
              const size_t offset = reps_2;
              reps_2 = reps_1;
              reps_1 = reps_0;
              reps_0 = offset;
              STAT_INC(g_Num_Rep2)
            }
          }
          else
          {
            if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
            {
              // litLen == 0 && bit == 1
              STAT_INC(g_Num_Rep3)
              v <<= 1;
              reps_2 = reps_1;
              reps_1 = reps_0;
              if (--reps_0 == 0)
              {
                // LZ_LOOP_ERROR_EXIT
                // original-zstd decoder : input is corrupted; force offset to 1
                // reps_0 = 1;
                reps_0++;
              }
            }
            else
            {
              // litLen != 0 && bit == 0
              v <<= 1;
              {
                const size_t offset = reps_1;
                reps_1 = reps_0;
                reps_0 = offset;
                STAT_INC(g_Num_Rep1)
              }
            }
          }
        }
        else
        {
          // (2 <= of_code)
          // if (of_code >= 32) LZ_LOOP_ERROR_EXIT // optional check
          // we don't allow (of_code >= 32) cases in another code
          // new offset = (1 << of_code) - 3 + of_code extra bits:
          reps_2 = reps_1;
          reps_1 = reps_0;
          reps_0 = ((size_t)1 << of_code) - 3 + (size_t)
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              (v >> (64 - of_code));
              v <<= of_code;
            #else
              (v >> (32 - of_code));
            #endif
        }
      }

      #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
        FSE_PRELOAD
      #endif

      // ---- match-length decoding ----
      matchLen = (size_t)GET_FSE_REC_SYM(STATE_VAR(ml))
          #ifndef Z7_ZSTD_DEC_USE_ML_PLUS3
            + MATCH_LEN_MIN
          #endif
          ;
      {
        {
          if (matchLen >= 32 + MATCH_LEN_MIN) // if (state_ml & 0x20)
          {
            const unsigned extra = BASES_TABLE(SEQ_ML_EXTRA) [(size_t)matchLen - MATCH_LEN_MIN];
            matchLen = BASES_TABLE(SEQ_ML_BASES) [(size_t)matchLen - MATCH_LEN_MIN];
            #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
               (defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) || \
                defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF))
            {
              UPDATE_BIT_OFFSET(bitOffset, extra)
              matchLen += (size_t)(v >> (64 - extra));
              #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
                FSE_PRELOAD
              #else
                v <<= extra;
              #endif
            }
            #else
            {
              UInt32 v32;
              STREAM_READ_BITS(v32, extra)
              matchLen += v32;
            }
            #endif
            STAT_INC(g_Num_Match)
          }
        }
      }

      #if  defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
          !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
          !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML)
        FSE_PRELOAD
      #endif

      // ---- literal-length decoding and literals copy ----
      {
        size_t litLen = GET_FSE_REC_SYM(STATE_VAR(ll));
        if (litLen)
        {
          // if (STATE_VAR(ll) & 0x70)
          if (litLen >= 16)
          {
            const unsigned extra = BASES_TABLE(SEQ_LL_EXTRA) [litLen];
            litLen = BASES_TABLE(SEQ_LL_BASES) [litLen];
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
            {
              UPDATE_BIT_OFFSET(bitOffset, extra)
              litLen += (size_t)(v >> (64 - extra));
              #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
                FSE_PRELOAD
              #else
                v <<= extra;
              #endif
            }
            #else
            {
              UInt32 v32;
              STREAM_READ_BITS(v32, extra)
              litLen += v32;
            }
            #endif
            STAT_INC(g_Num_LitsBig)
          }

          // fail if the block claims more literals than were decoded:
          if ((literalsLen -= (ptrdiff_t)litLen) < 0)
            LZ_LOOP_ERROR_EXIT
          totalOutCheck += litLen;
          {
            const size_t rem = winLimit - winPos;
            if (litLen > rem)
              LZ_LOOP_ERROR_EXIT
            {
              const Byte *literals_temp = literals;
              Byte *d = win + winPos;
              literals += litLen;
              winPos += litLen;
              CopyLiterals(d, literals_temp, litLen, rem);
            }
          }
        }
        STAT_UPDATE(else g_Num_Lit0++;)
      }

      // copy the match; (reps_0 > totalOutCheck) rejects offsets that
      // reach before the start of produced data:
      #define COPY_MATCH \
        { if (reps_0 > winSize || reps_0 > totalOutCheck) LZ_LOOP_ERROR_EXIT \
        totalOutCheck += matchLen; \
        { const size_t rem = winLimit - winPos; \
        if (matchLen > rem) LZ_LOOP_ERROR_EXIT \
        { const size_t winPos_temp = winPos; \
        winPos += matchLen; \
        CopyMatch(reps_0, matchLen, win, winPos_temp, rem, cycSize); }}}

      if (--numSeqs == 0)
      {
        COPY_MATCH
        break;
      }
      FSE_UPDATE_STATES
      COPY_MATCH
    } // for

    // the backward bit stream must be consumed exactly:
    if ((CBitCtr_signed)bitOffset != BIT_OFFSET_DELTA_BYTES * 8 - BIT_OFFSET_DELTA_BITS)
      return SZ_ERROR_DATA;

    // flush literals that remain after the last sequence:
    if (literalsLen)
    {
      const size_t rem = winLimit - winPos;
      if ((size_t)literalsLen > rem)
        return SZ_ERROR_DATA;
      {
        Byte *d = win + winPos;
        winPos += (size_t)literalsLen;
        totalOutCheck += (size_t)literalsLen;
        CopyLiterals
        // memcpy
          (d, literals, (size_t)literalsLen, rem);
      }
    }
    // saturate totalOutCheck at winSize (only the "full window" fact matters):
    if (totalOutCheck >= winSize)
      totalOutCheck = winSize;
    p->totalOutCheck = totalOutCheck;
    p->winPos = winPos;
    p->reps[0] = (CZstdDecOffset)reps_0;
    p->reps[1] = (CZstdDecOffset)reps_1;
    p->reps[2] = (CZstdDecOffset)reps_2;
  }
  return SZ_OK;
}
2289 
2290 
// for debug: define to check that ZstdDec1_NeedTempBufferForInput() works correctly:
// #define Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP // define it for debug only
#ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
// (debug) sequence count computed by ZstdDec1_NeedTempBufferForInput(),
// cross-checked against the value parsed by ZstdDec1_DecodeBlock():
static unsigned g_numSeqs;
#endif
2296 
2297 
// literals-section-header flags (low bits of the first byte of the block):
#define k_LitBlockType_Flag_RLE_or_Treeless  1
#define k_LitBlockType_Flag_Compressed       2

// outLimit : is strong limit
// outLimit <= ZstdDec1_GET_BLOCK_SIZE_LIMIT(p)
// inSize != 0
/* Decodes one compressed block:
   1) parses the literals-section header and obtains the literals
      (Raw / RLE / Huffman-compressed / Treeless),
   2) parses Number_of_Sequences and the Symbol_Compression_Modes byte,
      loading or reusing the three FSE tables (ll / of / ml),
   3) calls Decompress_Sequences() to produce the block output in p->win.
   Returns SZ_ERROR_DATA on any malformed header or stream. */
static
Z7_NO_INLINE
SRes ZstdDec1_DecodeBlock(CZstdDec1 *p,
    const Byte *src, SizeT inSize, SizeT afterAvail,
    const size_t outLimit)
{
  CZstdDec1_Vars vars;
  vars.literals = p->literalsBase;
  {
    const unsigned b0 = *src++;
    UInt32 numLits, compressedSize;
    const Byte *litStream;
    Byte *literalsDest;
    inSize--;

    // ---- literals section header ----
    if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
    {
      // Raw or RLE literals.
      // we need at least one additional byte for (numSeqs).
      // so we check for that additional byte in conditions.
      numLits = b0 >> 3;
      if (b0 & 4)
      {
        // 2- or 3-byte size format
        UInt32 v;
        if (inSize < 1 + 1) // we need at least 1 byte here and 1 byte for (numSeqs).
          return SZ_ERROR_DATA;
        numLits >>= 1;
        v = GetUi16(src);
        src += 2;
        inSize -= 2;
        if ((b0 & 8) == 0)
        {
          // 2-byte format: only one extra byte was actually used
          src--;
          inSize++;
          v = (Byte)v;
        }
        numLits += v << 4;
      }
      compressedSize = 1;   // RLE: one source byte
      if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
        compressedSize = numLits;  // Raw: literals are stored as-is
    }
    else if (inSize < 4)
      return SZ_ERROR_DATA;
    else
    {
      // Compressed or Treeless literals: sizes are packed into 3..5 bytes.
      const unsigned mode4Streams = b0 & 0xc;
      const unsigned numBytes = (3 * mode4Streams + 32) >> 4;
      const unsigned numBits = 4 * numBytes - 2;
      const UInt32 mask = ((UInt32)16 << numBits) - 1;
      compressedSize = GetUi32(src);
      numLits = ((
          #ifdef MY_CPU_LE_UNALIGN
            GetUi32(src - 1)
          #else
            ((compressedSize << 8) + b0)
          #endif
          ) >> 4) & mask;
      src += numBytes;
      inSize -= numBytes;
      compressedSize >>= numBits;
      compressedSize &= mask;
      /*
      if (numLits != 0) printf("inSize = %7u num_lits=%7u compressed=%7u ratio = %u  ratio2 = %u\n",
          i1, numLits, (unsigned)compressedSize * 1, (unsigned)compressedSize * 100 / numLits,
          (unsigned)numLits * 100 / (unsigned)inSize);
      }
      */
      if (compressedSize == 0)
        return SZ_ERROR_DATA; // (compressedSize == 0) is not allowed
    }

    STAT_UPDATE(g_Num_Lits += numLits;)

    vars.literalsLen = numLits;

    if (compressedSize >= inSize)
      return SZ_ERROR_DATA;
    litStream = src;
    src += compressedSize;
    inSize -= compressedSize;
    // inSize != 0

    // ---- Number_of_Sequences (1..3 bytes) ----
    {
      UInt32 numSeqs = *src++;
      inSize--;
      if (numSeqs > 127)
      {
        UInt32 b1;
        if (inSize == 0)
          return SZ_ERROR_DATA;
        numSeqs -= 128;
        b1 = *src++;
        inSize--;
        if (numSeqs == 127)
        {
          if (inSize == 0)
            return SZ_ERROR_DATA;
          numSeqs = (UInt32)(*src++) + 127;
          inSize--;
        }
        numSeqs = (numSeqs << 8) + b1;
      }
      // minimal possible output of the block must fit into outLimit:
      if (numSeqs * MATCH_LEN_MIN + numLits > outLimit)
        return SZ_ERROR_DATA;
      vars.numSeqs = numSeqs;

      STAT_UPDATE(g_NumSeqs_total += numSeqs;)
      /*
        #ifdef SHOW_STAT
        printf("\n %5u : %8u, %8u : %5u", (int)g_Num_Blocks_Compressed, (int)numSeqs, (int)g_NumSeqs_total,
          (int)g_NumSeqs_total / g_Num_Blocks_Compressed);
        #endif
        // printf("\nnumSeqs2 = %d", numSeqs);
      */
    #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
      if (numSeqs != g_numSeqs) return SZ_ERROR_DATA; // for debug
    #endif
      if (numSeqs == 0)
      {
        // no sequences: the whole block output is the literals,
        // so decode them straight into the window
        if (inSize != 0)
          return SZ_ERROR_DATA;
        literalsDest = p->win + p->winPos;
      }
      else
        literalsDest = p->literalsBase;
    }

    // ---- literals decoding ----
    if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
    {
      if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
      {
        // RLE literals: repeat single byte
        memset(literalsDest, litStream[0], numLits);
        if (vars.numSeqs)
        {
          // literalsDest == p->literalsBase == vars.literals
          #if COPY_CHUNK_SIZE > 1
            memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
          #endif
        }
      }
      else
      {
        // Raw literals
        // unsigned y;
        // for (y = 0; y < 10000; y++)
        memcpy(literalsDest, litStream, numLits);
        if (vars.numSeqs)
        {
          /* we need up to (15 == COPY_CHUNK_SIZE - 1) space for optimized CopyLiterals().
             If we have additional space in input stream after literals stream,
             we use direct copy of rar literals in input stream */
          if ((size_t)(src + inSize - litStream) - numLits + afterAvail >= (COPY_CHUNK_SIZE - 1))
            vars.literals = litStream;
          else
          {
            // literalsDest == p->literalsBase == vars.literals
            #if COPY_CHUNK_SIZE > 1
            /* CopyLiterals():
                1) we don't want reading non-initialized data
                2) we will copy only zero byte after literals buffer */
              memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
            #endif
          }
        }
      }
    }
    else
    {
      // Huffman-compressed (or Treeless: reuse previous table) literals
      CInBufPair hufStream;
      hufStream.ptr = litStream;
      hufStream.len = compressedSize;

      if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
      {
        // unsigned y = 100; CInBufPair hs2 = hufStream; do { hufStream = hs2;
        RINOK(Huf_DecodeTable(&p->huf, &hufStream))
        p->litHuf_wasSet = True;
        // } while (--y);
      }
      else if (!p->litHuf_wasSet)
        return SZ_ERROR_DATA;

      {
        // int yyy; for (yyy = 0; yyy < 34; yyy++) {
        SRes sres;
        if ((b0 & 0xc) == 0) // mode4Streams
          sres = Huf_Decompress_1stream((const Byte *)(const void *)p->huf.table64,
              hufStream.ptr - HUF_SRC_OFFSET, hufStream.len, literalsDest, numLits);
        else
        {
          // 6 bytes for the jump table + 4x1 bytes of end-padding Bytes)
          if (hufStream.len < 6 + 4)
            return SZ_ERROR_DATA;
          // the condition from original-zstd decoder:
          #define Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS 6
          if (numLits < Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS)
            return SZ_ERROR_DATA;
          sres = Huf_Decompress_4stream((const Byte *)(const void *)p->huf.table64,
              hufStream.ptr + (6 - HUF_SRC_OFFSET), hufStream.len, literalsDest, numLits);
        }
        RINOK(sres)
        // }
      }
    }

    if (vars.numSeqs == 0)
    {
      // literals were written directly to the window; block is done
      p->winPos += numLits;
      return SZ_OK;
    }
  }

  // ---- sequences section: table modes + FSE tables ----
  {
    CInBufPair in;
    unsigned mode;
    unsigned seqMode;

    in.ptr = src;
    in.len = inSize;
    if (in.len == 0)
      return SZ_ERROR_DATA;
    in.len--;
    mode = *in.ptr++;
    if (mode & 3) // Reserved bits
      return SZ_ERROR_DATA;

    // literal-length table (bits 7..6 of mode):
    seqMode = (mode >> 6);
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else RINOK(FSE_Decode_SeqTable(
        p->fse.ll,
        &in,
        6, // predefAccuracy
        &p->ll_accuracy,
        NUM_LL_SYMBOLS,
        k_PredefRecords_LL,
        seqMode))

    // offset table (bits 5..4):
    seqMode = (mode >> 4) & 3;
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else RINOK(FSE_Decode_SeqTable(
        p->fse.of,
        &in,
        5, // predefAccuracy
        &p->of_accuracy,
        NUM_OFFSET_SYMBOLS_MAX,
        k_PredefRecords_OF,
        seqMode))

    // match-length table (bits 3..2):
    seqMode = (mode >> 2) & 3;
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else
    {
      RINOK(FSE_Decode_SeqTable(
        p->fse.ml,
        &in,
        6, // predefAccuracy
        &p->ml_accuracy,
        NUM_ML_SYMBOLS,
        k_PredefRecords_ML,
        seqMode))
      /*
      #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
        // { unsigned y = 1 << 10; do
      {
        const unsigned accuracy = p->ml_accuracy;
        if (accuracy == 0)
          p->fse.ml[0] += 3;
        else
        #ifdef MY_CPU_64BIT
        {
          // alignemt (UInt64 _pad_Alignment) in fse.ml is required for that code
          UInt64 *table = (UInt64 *)(void *)p->fse.ml;
          const UInt64 *end = (const UInt64 *)(const void *)
            ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
          do
          {
            table[0] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
            table[1] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
            table += 2;
          }
          while (table != end);
        }
        #else
        {
          UInt32 *table = p->fse.ml;
          const UInt32 *end = (const UInt32 *)(const void *)
            ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
          do
          {
            table[0] += MATCH_LEN_MIN;
            table[1] += MATCH_LEN_MIN;
            table += 2;
            table[0] += MATCH_LEN_MIN;
            table[1] += MATCH_LEN_MIN;
            table += 2;
          }
          while (table != end);
        }
        #endif
      }
      // while (--y); }
      #endif
      */
    }

    // p->seqTables_wereSet = True;
    if (in.len == 0)
      return SZ_ERROR_DATA;
    return Decompress_Sequences(p,
        in.ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES, in.len,
        p->winPos + outLimit, &vars);
  }
}
2617 
2618 
2619 
2620 
// inSize != 0
// it must do similar to ZstdDec1_DecodeBlock()
/* Decides whether the block at (src, inSize) must be staged through the
   temp buffer before decoding.  Returns 1 only when the block has sequences
   and the data that precedes the sequences bit-stream
   (beforeSize + literals-section + headers) is smaller than
   (MAX_BACKWARD_DEPTH - 1), i.e. the backward bit-reader of
   Decompress_Sequences() could read before the caller's buffer.
   Returns 0 when direct decoding is safe, or when the headers cannot be
   parsed from (inSize) bytes. */
static size_t ZstdDec1_NeedTempBufferForInput(
    const SizeT beforeSize, const Byte * const src, const SizeT inSize)
{
  unsigned b0;
  UInt32 pos;

  #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
    g_numSeqs = 1 << 24;
  #else
  // we have at least 3 bytes before seq data: litBlockType, numSeqs, seqMode
  #define MIN_BLOCK_LZ_HEADERS_SIZE 3
  if (beforeSize >= MAX_BACKWARD_DEPTH - MIN_BLOCK_LZ_HEADERS_SIZE)
    return 0;
  #endif

  b0 = src[0];

  // mirror the literals-section-header parsing of ZstdDec1_DecodeBlock():
  if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
  {
    UInt32 numLits = b0 >> 3;
    pos = 1;
    if (b0 & 4)
    {
      UInt32 v;
      if (inSize < 3)
        return 0;
      numLits >>= 1;
      v = GetUi16(src + 1);
      pos = 3;
      if ((b0 & 8) == 0)
      {
        pos = 2;
        v = (Byte)v;
      }
      numLits += v << 4;
    }
    if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
      numLits = 1;  // RLE literals occupy a single source byte
    pos += numLits;
  }
  else if (inSize < 5)
    return 0;
  else
  {
    const unsigned mode4Streams = b0 & 0xc;
    const unsigned numBytes = (3 * mode4Streams + 48) >> 4;
    const unsigned numBits = 4 * numBytes - 6;
    UInt32 cs = GetUi32(src + 1);
    cs >>= numBits;
    cs &= ((UInt32)16 << numBits) - 1;
    if (cs == 0)
      return 0;
    pos = numBytes + cs;
  }

  if (pos >= inSize)
    return 0;
  // mirror the Number_of_Sequences parsing:
  {
    UInt32 numSeqs = src[pos++];
    if (numSeqs > 127)
    {
      UInt32 b1;
      if (pos >= inSize)
        return 0;
      numSeqs -= 128;
      b1 = src[pos++];
      if (numSeqs == 127)
      {
        if (pos >= inSize)
          return 0;
        numSeqs = (UInt32)(src[pos++]) + 127;
      }
      numSeqs = (numSeqs << 8) + b1;
    }
    #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
      g_numSeqs = numSeqs; // for debug
    #endif
    if (numSeqs == 0)
      return 0;
  }
  /*
  if (pos >= inSize)
    return 0;
  pos++;
  */
  // we will have one additional byte for seqMode:
  if (beforeSize + pos >= MAX_BACKWARD_DEPTH - 1)
    return 0;
  return 1;
}
2713 
2714 
2715 
// ---------- ZSTD FRAME ----------

// Block_Type values from the 3-byte block header (2 bits):
#define kBlockType_Raw          0
#define kBlockType_RLE          1
#define kBlockType_Compressed   2
#define kBlockType_Reserved     3

// frame-level parsing states (see ZstdDec_UpdateState()):
typedef enum
{
  // begin: states that require 4 bytes:
  ZSTD2_STATE_SIGNATURE,    // reading 4-byte frame magic number
  ZSTD2_STATE_HASH,         // reading 4-byte xxh64-low32 content checksum
  ZSTD2_STATE_SKIP_HEADER,  // reading 4-byte size of a skippable frame (stored to blockSize)
  // end of states that require 4 bytes

  ZSTD2_STATE_SKIP_DATA,    // skipping payload of a skippable frame
  ZSTD2_STATE_FRAME_HEADER, // reading frame header (descriptor, window, dictId, contentSize)
  ZSTD2_STATE_AFTER_HEADER,
  ZSTD2_STATE_BLOCK,        // reading 3-byte block header (4 bytes for RLE)
  ZSTD2_STATE_DATA,         // reading block payload
  ZSTD2_STATE_FINISHED
} EZstd2State;
2738 
2739 
// full frame-level decoder object (public handle is CZstdDecHandle)
struct CZstdDec
{
  EZstd2State frameState;  // current frame-parsing state
  unsigned tempSize;       // number of bytes currently collected in temp[]

  Byte temp[14]; // 14 is required  // staging buffer for frame/block headers

  Byte descriptor;        // Frame_Header_Descriptor byte (see DESCRIPTOR_* macros)
  Byte windowDescriptor;  // Window_Descriptor byte — presumably set during frame-header parsing (outside this chunk)
  Byte isLastBlock;       // low bit of the current block header
  Byte blockType;         // kBlockType_* of the current block
  Byte isErrorState;      // sticky error flag (data/unsupported errors)
  Byte hashError;         // set when the xxh64 content checksum mismatched
  Byte disableHash;       // when set, checksum verification is skipped
  Byte isCyclicMode;

  UInt32 blockSize;          // current block payload size; also size of a skippable frame
  UInt32 dictionaryId;       // Dictionary_ID from the frame header — NOTE(review): not referenced in this chunk
  UInt32 curBlockUnpackRem;  // for compressed blocks only
  UInt32 inTempPos;          // write position inside inTemp for the current block

  UInt64 contentSize;       // Frame_Content_Size (when defined by descriptor)
  UInt64 contentProcessed;  // bytes of frame content produced so far
  CXxh64State xxh64;        // running content checksum state

  Byte *inTemp;               // temp input buffer (allocated by ZstdDec_AllocateMisc())
  SizeT winBufSize_Allocated; // allocated size of win_Base
  Byte *win_Base;             // window buffer (allocated via alloc_Big)

  ISzAllocPtr alloc_Small;  // allocator for the object and small buffers
  ISzAllocPtr alloc_Big;    // allocator for the window

  CZstdDec1 decoder;  // block-level decoder state
};

// number of bytes at the end of the window that were not yet fed to xxh64:
#define ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p) \
  ((unsigned)(p)->contentProcessed & (Z7_XXH64_BLOCK_SIZE - 1))

#define ZSTD_DEC_IS_LAST_BLOCK(p) ((p)->isLastBlock)
2779 
2780 
2781 static void ZstdDec_FreeWindow(CZstdDec * const p)
2782 {
2783   if (p->win_Base)
2784   {
2785     ISzAlloc_Free(p->alloc_Big, p->win_Base);
2786     p->win_Base = NULL;
2787     // p->decoder.win = NULL;
2788     p->winBufSize_Allocated = 0;
2789   }
2790 }
2791 
2792 
2793 CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big)
2794 {
2795   CZstdDec *p = (CZstdDec *)ISzAlloc_Alloc(alloc_Small, sizeof(CZstdDec));
2796   if (!p)
2797     return NULL;
2798   p->alloc_Small = alloc_Small;
2799   p->alloc_Big = alloc_Big;
2800   // ZstdDec_CONSTRUCT(p)
2801   p->inTemp = NULL;
2802   p->win_Base = NULL;
2803   p->winBufSize_Allocated = 0;
2804   p->disableHash = False;
2805   ZstdDec1_Construct(&p->decoder);
2806   return p;
2807 }
2808 
/* Frees all buffers owned by the decoder (literals buffer, temp input
   buffer, window) and then the object itself.
   In SHOW_STAT builds it first prints the collected decoding counters. */
void ZstdDec_Destroy(CZstdDecHandle p)
{
  #ifdef SHOW_STAT
    #define PRINT_STAT1(name, v) \
      printf("\n%25s = %9u", name, v);
  PRINT_STAT1("g_Num_Blocks_Compressed", g_Num_Blocks_Compressed)
  PRINT_STAT1("g_Num_Blocks_memcpy", g_Num_Blocks_memcpy)
  PRINT_STAT1("g_Num_Wrap_memmove_Num", g_Num_Wrap_memmove_Num)
  PRINT_STAT1("g_Num_Wrap_memmove_Bytes", g_Num_Wrap_memmove_Bytes)
  if (g_Num_Blocks_Compressed)
  {
    #define PRINT_STAT(name, v) \
      printf("\n%17s = %9u, per_block = %8u", name, v, v / g_Num_Blocks_Compressed);
    PRINT_STAT("g_NumSeqs", g_NumSeqs_total)
    // PRINT_STAT("g_NumCopy", g_NumCopy)
    PRINT_STAT("g_NumOver", g_NumOver)
    PRINT_STAT("g_NumOver2", g_NumOver2)
    PRINT_STAT("g_Num_Match", g_Num_Match)
    PRINT_STAT("g_Num_Lits", g_Num_Lits)
    PRINT_STAT("g_Num_LitsBig", g_Num_LitsBig)
    PRINT_STAT("g_Num_Lit0", g_Num_Lit0)
    PRINT_STAT("g_Num_Rep_0", g_Num_Rep0)
    PRINT_STAT("g_Num_Rep_1", g_Num_Rep1)
    PRINT_STAT("g_Num_Rep_2", g_Num_Rep2)
    PRINT_STAT("g_Num_Rep_3", g_Num_Rep3)
    PRINT_STAT("g_Num_Threshold_0", g_Num_Threshold_0)
    PRINT_STAT("g_Num_Threshold_1", g_Num_Threshold_1)
    PRINT_STAT("g_Num_Threshold_0sum", g_Num_Threshold_0sum)
    PRINT_STAT("g_Num_Threshold_1sum", g_Num_Threshold_1sum)
  }
  printf("\n");
  #endif

  ISzAlloc_Free(p->alloc_Small, p->decoder.literalsBase);
  // p->decoder.literalsBase = NULL;
  ISzAlloc_Free(p->alloc_Small, p->inTemp);
  // p->inTemp = NULL;
  ZstdDec_FreeWindow(p);
  ISzAlloc_Free(p->alloc_Small, p);
}
2849 
2850 
2851 
// bytes reserved in front of the block data inside inTemp, so the backward
// bit-reader of Decompress_Sequences() never reads before the buffer start:
#define kTempBuffer_PreSize  (1u << 6)
#if kTempBuffer_PreSize < MAX_BACKWARD_DEPTH
  #error Stop_Compiling_Bad_kTempBuffer_PreSize
#endif
2856 
2857 static SRes ZstdDec_AllocateMisc(CZstdDec *p)
2858 {
2859   #define k_Lit_AfterAvail  (1u << 6)
2860   #if k_Lit_AfterAvail < (COPY_CHUNK_SIZE - 1)
2861     #error Stop_Compiling_Bad_k_Lit_AfterAvail
2862   #endif
2863   // return ZstdDec1_Allocate(&p->decoder, p->alloc_Small);
2864   if (!p->decoder.literalsBase)
2865   {
2866     p->decoder.literalsBase = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2867         kBlockSizeMax + k_Lit_AfterAvail);
2868     if (!p->decoder.literalsBase)
2869       return SZ_ERROR_MEM;
2870   }
2871   if (!p->inTemp)
2872   {
2873     // we need k_Lit_AfterAvail here for owerread from raw literals stream
2874     p->inTemp = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2875         kBlockSizeMax + kTempBuffer_PreSize + k_Lit_AfterAvail);
2876     if (!p->inTemp)
2877       return SZ_ERROR_MEM;
2878   }
2879   return SZ_OK;
2880 }
2881 
2882 
2883 static void ZstdDec_Init_ForNewFrame(CZstdDec *p)
2884 {
2885   p->frameState = ZSTD2_STATE_SIGNATURE;
2886   p->tempSize = 0;
2887 
2888   p->isErrorState = False;
2889   p->hashError = False;
2890   p->isCyclicMode = False;
2891   p->contentProcessed = 0;
2892   Xxh64State_Init(&p->xxh64);
2893   ZstdDec1_Init(&p->decoder);
2894 }
2895 
2896 
2897 void ZstdDec_Init(CZstdDec *p)
2898 {
2899   ZstdDec_Init_ForNewFrame(p);
2900   p->decoder.winPos = 0;
2901   memset(p->temp, 0, sizeof(p->temp));
2902 }
2903 
2904 
// bit fields of the Frame_Header_Descriptor byte (p->descriptor):
#define DESCRIPTOR_Get_DictionaryId_Flag(d)   ((d) & 3)
#define DESCRIPTOR_FLAG_CHECKSUM              (1 << 2)   // frame ends with 4-byte xxh64 checksum
#define DESCRIPTOR_FLAG_RESERVED              (1 << 3)
// #define DESCRIPTOR_FLAG_UNUSED                (1 << 4)
#define DESCRIPTOR_FLAG_SINGLE                (1 << 5)
#define DESCRIPTOR_Get_ContentSize_Flag3(d)   ((d) >> 5)
// content size is present when any of the top 3 bits is set:
#define DESCRIPTOR_Is_ContentSize_Defined(d)  (((d) & 0xe0) != 0)
2912 
2913 
2914 static EZstd2State ZstdDec_UpdateState(CZstdDec * const p, const Byte b, CZstdDecInfo * const info)
2915 {
2916   unsigned tempSize = p->tempSize;
2917   p->temp[tempSize++] = b;
2918   p->tempSize = tempSize;
2919 
2920   if (p->frameState == ZSTD2_STATE_BLOCK)
2921   {
2922     if (tempSize < 3)
2923       return ZSTD2_STATE_BLOCK;
2924     {
2925       UInt32 b0 = GetUi32(p->temp);
2926       const unsigned type = ((unsigned)b0 >> 1) & 3;
2927       if (type == kBlockType_RLE && tempSize == 3)
2928         return ZSTD2_STATE_BLOCK;
2929       // info->num_Blocks_forType[type]++;
2930       info->num_Blocks++;
2931       if (type == kBlockType_Reserved)
2932       {
2933         p->isErrorState = True; // SZ_ERROR_UNSUPPORTED
2934         return ZSTD2_STATE_BLOCK;
2935       }
2936       p->blockType = (Byte)type;
2937       p->isLastBlock = (Byte)(b0 & 1);
2938       p->inTempPos = 0;
2939       p->tempSize = 0;
2940       b0 >>= 3;
2941       b0 &= 0x1fffff;
2942       // info->num_BlockBytes_forType[type] += b0;
2943       if (b0 == 0)
2944       {
2945         // empty RAW/RLE blocks are allowed in original-zstd decoder
2946         if (type == kBlockType_Compressed)
2947         {
2948           p->isErrorState = True;
2949           return ZSTD2_STATE_BLOCK;
2950         }
2951         if (!ZSTD_DEC_IS_LAST_BLOCK(p))
2952           return ZSTD2_STATE_BLOCK;
2953         if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
2954           return ZSTD2_STATE_HASH;
2955         return ZSTD2_STATE_FINISHED;
2956       }
2957       p->blockSize = b0;
2958       {
2959         UInt32 blockLim = ZstdDec1_GET_BLOCK_SIZE_LIMIT(&p->decoder);
2960         // compressed and uncompressed block sizes cannot be larger than min(kBlockSizeMax, window_size)
2961         if (b0 > blockLim)
2962         {
2963           p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2964           return ZSTD2_STATE_BLOCK;
2965         }
2966         if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor))
2967         {
2968           const UInt64 rem = p->contentSize - p->contentProcessed;
2969           if (blockLim > rem)
2970               blockLim = (UInt32)rem;
2971         }
2972         p->curBlockUnpackRem = blockLim;
2973         // uncompressed block size cannot be larger than remain data size:
2974         if (type != kBlockType_Compressed)
2975         {
2976           if (b0 > blockLim)
2977           {
2978             p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2979             return ZSTD2_STATE_BLOCK;
2980           }
2981         }
2982       }
2983     }
2984     return ZSTD2_STATE_DATA;
2985   }
2986 
2987   if ((unsigned)p->frameState < ZSTD2_STATE_SKIP_DATA)
2988   {
2989     UInt32 v;
2990     if (tempSize != 4)
2991       return p->frameState;
2992     v = GetUi32(p->temp);
2993     if ((unsigned)p->frameState < ZSTD2_STATE_HASH) // == ZSTD2_STATE_SIGNATURE
2994     {
2995       if (v == 0xfd2fb528)
2996       {
2997         p->tempSize = 0;
2998         info->num_DataFrames++;
2999         return ZSTD2_STATE_FRAME_HEADER;
3000       }
3001       if ((v & 0xfffffff0) == 0x184d2a50)
3002       {
3003         p->tempSize = 0;
3004         info->num_SkipFrames++;
3005         return ZSTD2_STATE_SKIP_HEADER;
3006       }
3007       p->isErrorState = True;
3008       return ZSTD2_STATE_SIGNATURE;
3009       // return ZSTD2_STATE_ERROR; // is not ZSTD stream
3010     }
3011     if (p->frameState == ZSTD2_STATE_HASH)
3012     {
3013       info->checksum_Defined = True;
3014       info->checksum = v;
3015       // #ifndef DISABLE_XXH_CHECK
3016       if (!p->disableHash)
3017       {
3018         if (p->decoder.winPos < ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p))
3019         {
3020           // unexpected code failure
3021           p->isErrorState = True;
3022           // SZ_ERROR_FAIL;
3023         }
3024         else
3025         if ((UInt32)Xxh64State_Digest(&p->xxh64,
3026             p->decoder.win + (p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)),
3027             p->contentProcessed) != v)
3028         {
3029           p->hashError = True;
3030           // return ZSTD2_STATE_ERROR; // hash error
3031         }
3032       }
3033       // #endif
3034       return ZSTD2_STATE_FINISHED;
3035     }
3036     // (p->frameState == ZSTD2_STATE_SKIP_HEADER)
3037     {
3038       p->blockSize = v;
3039       info->skipFrames_Size += v;
3040       p->tempSize = 0;
      /* we want the caller to be able to know that a frame was
         finished. So we allow the case where
         we have ZSTD2_STATE_SKIP_DATA state with (blockSize == 0).
      */
3045       // if (v == 0) return ZSTD2_STATE_SIGNATURE;
3046       return ZSTD2_STATE_SKIP_DATA;
3047     }
3048   }
3049 
3050   // if (p->frameState == ZSTD2_STATE_FRAME_HEADER)
3051   {
3052     unsigned descriptor;
3053     const Byte *h;
3054     descriptor = p->temp[0];
3055     p->descriptor = (Byte)descriptor;
3056     if (descriptor & DESCRIPTOR_FLAG_RESERVED) // reserved bit
3057     {
3058       p->isErrorState = True;
3059       return ZSTD2_STATE_FRAME_HEADER;
3060       // return ZSTD2_STATE_ERROR;
3061     }
3062     {
3063       const unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3064       // tempSize -= 1 + ((1u << (n >> 1)) | ((n + 1) & 1));
3065       tempSize -= (0x9a563422u >> (n * 4)) & 0xf;
3066     }
3067     if (tempSize != (4u >> (3 - DESCRIPTOR_Get_DictionaryId_Flag(descriptor))))
3068       return ZSTD2_STATE_FRAME_HEADER;
3069 
3070     info->descriptor_OR     = (Byte)(info->descriptor_OR     |  descriptor);
3071     info->descriptor_NOT_OR = (Byte)(info->descriptor_NOT_OR | ~descriptor);
3072 
3073     h = &p->temp[1];
3074     {
3075       Byte w = 0;
3076       if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3077       {
3078         w = *h++;
3079         if (info->windowDescriptor_MAX < w)
3080             info->windowDescriptor_MAX = w;
3081         // info->are_WindowDescriptors = True;
3082         // info->num_WindowDescriptors++;
3083       }
3084       else
3085       {
3086         // info->are_SingleSegments = True;
3087         // info->num_SingleSegments++;
3088       }
3089       p->windowDescriptor = w;
3090     }
3091     {
3092       unsigned n = DESCRIPTOR_Get_DictionaryId_Flag(descriptor);
3093       UInt32 d = 0;
3094       if (n)
3095       {
3096         n = 1u << (n - 1);
3097         d = GetUi32(h) & ((UInt32)(Int32)-1 >> (32 - 8u * n));
3098         h += n;
3099       }
3100       p->dictionaryId = d;
3101       // info->dictionaryId_Cur = d;
3102       if (d != 0)
3103       {
3104         if (info->dictionaryId == 0)
3105           info->dictionaryId = d;
3106         else if (info->dictionaryId != d)
3107           info->are_DictionaryId_Different = True;
3108       }
3109     }
3110     {
3111       unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3112       UInt64 v = 0;
3113       if (n)
3114       {
3115         n >>= 1;
3116         if (n == 1)
3117           v = 256;
3118         v += GetUi64(h) & ((UInt64)(Int64)-1 >> (64 - (8u << n)));
3119         // info->are_ContentSize_Known = True;
3120         // info->num_Frames_with_ContentSize++;
3121         if (info->contentSize_MAX < v)
3122             info->contentSize_MAX = v;
3123         info->contentSize_Total += v;
3124       }
3125       else
3126       {
3127         info->are_ContentSize_Unknown = True;
3128         // info->num_Frames_without_ContentSize++;
3129       }
3130       p->contentSize = v;
3131     }
3132     // if ((size_t)(h - p->temp) != headerSize) return ZSTD2_STATE_ERROR; // it's unexpected internal code failure
3133     p->tempSize = 0;
3134 
3135     info->checksum_Defined = False;
3136     /*
3137     if (descriptor & DESCRIPTOR_FLAG_CHECKSUM)
3138       info->are_Checksums = True;
3139     else
3140       info->are_Non_Checksums = True;
3141     */
3142 
3143     return ZSTD2_STATE_AFTER_HEADER; // ZSTD2_STATE_BLOCK;
3144   }
3145 }
3146 
3147 
3148 static void ZstdDec_Update_XXH(CZstdDec * const p, size_t xxh64_winPos)
3149 {
3150  /*
3151  #ifdef DISABLE_XXH_CHECK
3152   UNUSED_VAR(data)
3153  #else
3154  */
3155   if (!p->disableHash && (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM))
3156   {
3157     // const size_t pos = p->xxh64_winPos;
3158     const size_t size = (p->decoder.winPos - xxh64_winPos) & ~(size_t)31;
3159     if (size)
3160     {
3161       // p->xxh64_winPos = pos + size;
3162       Xxh64State_UpdateBlocks(&p->xxh64,
3163           p->decoder.win + xxh64_winPos,
3164           p->decoder.win + xxh64_winPos + size);
3165     }
3166   }
3167 }
3168 
3169 
3170 /*
3171 in:
3172   (winLimit) : is relaxed limit, where this function is allowed to stop writing of decoded data (if possible).
3173     - this function uses (winLimit) for RAW/RLE blocks only,
3174         because this function can decode single RAW/RLE block in several different calls.
3175     - this function DOESN'T use (winLimit) for Compressed blocks,
3176         because this function decodes full compressed block in single call.
3177   (CZstdDec1::winPos <= winLimit)
3178   (winLimit <= CZstdDec1::cycSize).
3179   Note: if (ds->outBuf_fromCaller) mode is used, then
3180   {
3181     (strong_limit) is stored in CZstdDec1::cycSize.
3182     So (winLimit) is more strong than (strong_limit).
3183   }
3184 
3185 exit:
3186   Note: (CZstdDecState::winPos) will be set by caller after exit of this function.
3187 
3188   This function can exit for any of these conditions:
3189     - (frameState == ZSTD2_STATE_AFTER_HEADER)
3190     - (frameState == ZSTD2_STATE_FINISHED) : frame was finished : (status == ZSTD_STATUS_FINISHED_FRAME) is set
3191     - finished non-empty non-last block. So (CZstdDec1::winPos_atExit != winPos_atFuncStart).
3192     - ZSTD_STATUS_NEEDS_MORE_INPUT in src
3193     - (CZstdDec1::winPos) have reached (winLimit) in non-finished RAW/RLE block
3194 
3195   This function decodes no more than one non-empty block.
3196   So it fulfills the condition at exit:
3197     (CZstdDec1::winPos_atExit - winPos_atFuncStart <= block_size_max)
3198   Note: (winPos_atExit > winLimit) is possible in some cases after compressed block decoding.
3199 
  if (ds->outBuf_fromCaller) mode (useAdditionalWinLimit mode)
3201   {
3202     then this function uses additional strong limit from (CZstdDec1::cycSize).
3203     So this function will not write any data after (CZstdDec1::cycSize)
3204     And it fulfills the condition at exit:
3205       (CZstdDec1::winPos_atExit <= CZstdDec1::cycSize)
3206   }
3207 */
/* Per-frame state machine step: parses header/signature bytes via
   ZstdDec_UpdateState() and decodes at most one non-empty
   RAW / RLE / Compressed block per call (see the contract comment above).
   Consumes input from (ds->inBuf + ds->inPos) up to (ds->inLim),
   advances (ds->inPos), and reports progress via (ds->status). */
static SRes ZstdDec_DecodeBlock(CZstdDec * const p, CZstdDecState * const ds,
    SizeT winLimitAdd)
{
  const Byte *src = ds->inBuf;
  SizeT * const srcLen = &ds->inPos;
  const SizeT inSize = ds->inLim;
  // const int useAdditionalWinLimit = ds->outBuf_fromCaller ? 1 : 0;
  enum_ZstdStatus * const status = &ds->status;
  CZstdDecInfo * const info = &ds->info;
  SizeT winLimit;

  const SizeT winPos_atFuncStart = p->decoder.winPos;
  src += *srcLen;
  *status = ZSTD_STATUS_NOT_SPECIFIED;

  // finishMode = ZSTD_FINISH_ANY;
  if (ds->outSize_Defined)
  {
    if (ds->outSize < ds->outProcessed)
    {
      // p->isAfterSizeMode = 2; // we have extra bytes already
      *status = ZSTD_STATUS_OUT_REACHED;
      return SZ_OK;
      // size = 0;
    }
    else
    {
      // p->outSize >= p->outProcessed
      // clamp the relaxed write limit so we don't decode far past (outSize):
      const UInt64 rem = ds->outSize - ds->outProcessed;
      /*
      if (rem == 0)
      p->isAfterSizeMode = 1; // we have reached exact required size
      */
      if (winLimitAdd >= rem)
      {
        winLimitAdd = (SizeT)rem;
        // if (p->finishMode) finishMode = ZSTD_FINISH_END;
      }
    }
  }

  winLimit = p->decoder.winPos + winLimitAdd;
  // (p->decoder.winPos <= winLimit)

  // while (p->frameState != ZSTD2_STATE_ERROR)
  while (!p->isErrorState)
  {
    SizeT inCur = inSize - *srcLen;

    if (p->frameState == ZSTD2_STATE_DATA)
    {
      /* (p->decoder.winPos == winPos_atFuncStart) is expected,
         because this function doesn't start new block,
         if it has finished some non-empty block in this call. */
      if (p->decoder.winPos != winPos_atFuncStart)
        return SZ_ERROR_FAIL; // it's unexpected

      /*
      if (p->decoder.winPos > winLimit)
      {
        // we can be here, if in this function call
        //      - we have extracted non-empty compressed block, and (winPos > winLimit) after that.
        //      - we have started new block decoding after that.
        // It's unexpected case, because we exit after non-empty non-last block.
        *status = (inSize == *srcLen) ?
            ZSTD_STATUS_NEEDS_MORE_INPUT :
            ZSTD_STATUS_NOT_FINISHED;
        return SZ_OK;
      }
      */
      // p->decoder.winPos <= winLimit

      if (p->blockType != kBlockType_Compressed)
      {
        // it's RLE or RAW block.
        // p->blockSize != 0
        // winLimit <= p->decoder.cycSize
        /* So here we use more strong (winLimit), even for
           (ds->outBuf_fromCaller) mode. */
        SizeT outCur = winLimit - p->decoder.winPos;
        {
          const UInt32 rem = p->blockSize;
          if (outCur > rem)
              outCur = rem;
        }
        if (p->blockType == kBlockType_Raw)
        {
          // RAW block is a straight copy, additionally limited by available input:
          if (outCur > inCur)
              outCur = inCur;
          /* output buffer is better aligned for XXH code.
             So we use hash for output buffer data */
          // ZstdDec_Update_XXH(p, src, outCur); // for debug:
          memcpy(p->decoder.win + p->decoder.winPos, src, outCur);
          src += outCur;
          *srcLen += outCur;
        }
        else // kBlockType_RLE
        {
          // the RLE filler byte was stored in temp[3] by the header parser:
          #define RLE_BYTE_INDEX_IN_temp  3
          memset(p->decoder.win + p->decoder.winPos,
              p->temp[RLE_BYTE_INDEX_IN_temp], outCur);
        }
        {
          // hash from the first byte that XXH64 has not processed yet:
          const SizeT xxh64_winPos = p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
          p->decoder.winPos += outCur;
          p->contentProcessed += outCur;
          ZstdDec_Update_XXH(p, xxh64_winPos);
        }
        // ds->winPos = p->decoder.winPos;  // the caller does it instead. for debug:
        UPDATE_TOTAL_OUT(&p->decoder, outCur)
        ds->outProcessed += outCur;
        // (nonzero remainder) : RAW/RLE block was not finished in this call:
        if (p->blockSize -= (UInt32)outCur)
        {
          /*
          if (ds->outSize_Defined)
          {
            if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
               (ds->outSize == ds->outProcessed ? 1u: 2u);
          }
          */
          *status = (enum_ZstdStatus)
              (ds->outSize_Defined && ds->outSize <= ds->outProcessed ?
                ZSTD_STATUS_OUT_REACHED : (p->blockType == kBlockType_Raw && inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED);
          return SZ_OK;
        }
      }
      else // kBlockType_Compressed
      {
        // p->blockSize != 0
        // (uncompressed_size_of_block == 0) is allowed
        // (p->curBlockUnpackRem == 0) is allowed
        /*
        if (p->decoder.winPos >= winLimit)
        {
          if (p->decoder.winPos != winPos_atFuncStart)
          {
            // it's unexpected case
            // We already have some data in finished blocks in this function call.
            //   So we don't decompress new block after (>=winLimit),
            //   even if it's empty block.
            *status = (inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED;
            return SZ_OK;
          }
          // (p->decoder.winPos == winLimit == winPos_atFuncStart)
          // we will decode current block, because that current
          //   block can be empty block and we want to make some visible
          //   change of (src) stream after function start.
        }
        */
        /*
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          // we don't want to start new block, if we have more extra decoded bytes already
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK;
        }
        */
        {
          const Byte *comprStream;
          size_t afterAvail;
          UInt32 inTempPos = p->inTempPos;
          const UInt32 rem = p->blockSize - inTempPos;
          // rem != 0
          /* the compressed block must be available as one contiguous buffer;
             if the caller's input is partial (or unsuitable for backward reads),
             we accumulate the block in (inTemp) first: */
          if (inTempPos != 0  // (inTemp) buffer already contains some input data
              || inCur < rem  // available input data size is smaller than compressed block size
              || ZstdDec1_NeedTempBufferForInput(*srcLen, src, rem))
          {
            if (inCur > rem)
                inCur = rem;
            if (inCur)
            {
              STAT_INC(g_Num_Blocks_memcpy)
              // we clear data for backward lookahead reading
              if (inTempPos == 0)
                memset(p->inTemp + kTempBuffer_PreSize - MAX_BACKWARD_DEPTH, 0, MAX_BACKWARD_DEPTH);
              // { unsigned y = 0; for(;y < 1000; y++)
              memcpy(p->inTemp + inTempPos + kTempBuffer_PreSize, src, inCur);
              // }
              src += inCur;
              *srcLen += inCur;
              inTempPos += (UInt32)inCur;
              p->inTempPos = inTempPos;
            }
            if (inTempPos != p->blockSize)
            {
              // block is still incomplete in (inTemp):
              *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
              return SZ_OK;
            }
            #if COPY_CHUNK_SIZE > 1
              memset(p->inTemp + kTempBuffer_PreSize + inTempPos, 0, COPY_CHUNK_SIZE);
            #endif
            comprStream = p->inTemp + kTempBuffer_PreSize;
            afterAvail = k_Lit_AfterAvail;
            // we don't want to read non-initialized data or junk in CopyMatch():
          }
          else
          {
            // inCur >= rem
            // we use direct decoding from (src) buffer:
            afterAvail = inCur - rem;
            comprStream = src;
            src += rem;
            *srcLen += rem;
          }

          #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
            ZstdDec1_NeedTempBufferForInput(*srcLen, comprStream, p->blockSize);
          #endif
          // printf("\nblockSize=%u", p->blockSize);
          // printf("%x\n", (unsigned)p->contentProcessed);
          STAT_INC(g_Num_Blocks_Compressed)
          {
            SRes sres;
            const size_t winPos = p->decoder.winPos;
            /*
               if ( useAdditionalWinLimit), we use strong unpack limit: smallest from
                  - limit from stream : (curBlockUnpackRem)
                  - limit from caller : (cycSize - winPos)
               if (!useAdditionalWinLimit), we use only relaxed limit:
                  - limit from stream : (curBlockUnpackRem)
            */
            SizeT outLimit = p->curBlockUnpackRem;
            if (ds->outBuf_fromCaller)
            // if (useAdditionalWinLimit)
            {
              const size_t limit = p->decoder.cycSize - winPos;
              if (outLimit > limit)
                  outLimit = limit;
            }
            sres = ZstdDec1_DecodeBlock(&p->decoder,
                comprStream, p->blockSize, afterAvail, outLimit);
            // ds->winPos = p->decoder.winPos;  // the caller does it instead. for debug:
            if (sres)
            {
              p->isErrorState = True;
              return sres;
            }
            {
              // hash from the first byte that XXH64 has not processed yet:
              const SizeT xxh64_winPos = winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
              const size_t num = p->decoder.winPos - winPos;
              ds->outProcessed += num;
              p->contentProcessed += num;
              ZstdDec_Update_XXH(p, xxh64_winPos);
            }
          }
          // printf("\nwinPos=%x", (int)(unsigned)p->decoder.winPos);
        }
      }

      /*
      if (ds->outSize_Defined)
      {
        if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
           (ds->outSize == ds->outProcessed ? 1u: 2u);
      }
      */

      if (!ZSTD_DEC_IS_LAST_BLOCK(p))
      {
        p->frameState = ZSTD2_STATE_BLOCK;
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK;
        }
        // we exit only if (winPos) was changed in this function call:
        if (p->decoder.winPos != winPos_atFuncStart)
        {
          // decoded block was not empty. So we exit:
          *status = (enum_ZstdStatus)(
              (inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED);
          return SZ_OK;
        }
        // (p->decoder.winPos == winPos_atFuncStart)
        // so current decoded block was empty.
        // we will try to decode more blocks in this function.
        continue;
      }

      // decoded block was last in frame
      if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
      {
        p->frameState = ZSTD2_STATE_HASH;
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK; // disable if want to
          /* We want to get same return codes for any input buffer sizes.
             We want to get faster ZSTD_STATUS_OUT_REACHED status.
             So we exit with ZSTD_STATUS_OUT_REACHED here,
             instead of ZSTD2_STATE_HASH and ZSTD2_STATE_FINISHED processing.
             that depends from input buffer size and that can set
             ZSTD_STATUS_NEEDS_MORE_INPUT or return SZ_ERROR_DATA or SZ_ERROR_CRC.
          */
        }
      }
      else
      {
        /* ZSTD2_STATE_FINISHED processing doesn't depend from input buffer */
        p->frameState = ZSTD2_STATE_FINISHED;
      }
      /*
      p->frameState = (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) ?
          ZSTD2_STATE_HASH :
          ZSTD2_STATE_FINISHED;
      */
      /* it's required to process ZSTD2_STATE_FINISHED state in this function call,
         because we must check contentSize and hashError in ZSTD2_STATE_FINISHED code,
         while the caller can reinit full state for ZSTD2_STATE_FINISHED
         So we can't exit from function here. */
      continue;
    }

    if (p->frameState == ZSTD2_STATE_FINISHED)
    {
      *status = ZSTD_STATUS_FINISHED_FRAME;
      // detect truncated / oversized frames when the header declared content size:
      if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)
          && p->contentSize != p->contentProcessed)
        return SZ_ERROR_DATA;
      if (p->hashError) // for debug
        return SZ_ERROR_CRC;
      return SZ_OK;
      // p->frameState = ZSTD2_STATE_SIGNATURE;
      // continue;
    }

    if (p->frameState == ZSTD2_STATE_AFTER_HEADER)
      return SZ_OK; // we need memory allocation for that state

    if (p->frameState == ZSTD2_STATE_SKIP_DATA)
    {
      // skippable frame payload: just consume (blockSize) bytes of input
      UInt32 blockSize = p->blockSize;
      // (blockSize == 0) is possible
      if (inCur > blockSize)
          inCur = blockSize;
      src += inCur;
      *srcLen += inCur;
      blockSize -= (UInt32)inCur;
      p->blockSize = blockSize;
      if (blockSize == 0)
      {
        p->frameState = ZSTD2_STATE_SIGNATURE;
        // continue; // for debug: we can continue without return to caller.
        // we notify the caller that skip frame was finished:
        *status = ZSTD_STATUS_FINISHED_FRAME;
        return SZ_OK;
      }
      // blockSize != 0
      // (inCur) was smaller than previous value of p->blockSize.
      // (inSize == *srcLen) now
      *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
      return SZ_OK;
    }

    if (inCur == 0)
    {
      *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
      return SZ_OK;
    }

    // header / signature parsing: feed one input byte to the state machine
    {
      (*srcLen)++;
      p->frameState = ZstdDec_UpdateState(p, *src++, info);
    }
  }

  // we get here only if (isErrorState) was set inside ZstdDec_UpdateState():
  *status = ZSTD_STATUS_NOT_SPECIFIED;
  p->isErrorState = True;
  // p->frameState = ZSTD2_STATE_ERROR;
  // if (p->frameState = ZSTD2_STATE_SIGNATURE)  return SZ_ERROR_NO_ARCHIVE
  return SZ_ERROR_DATA;
}
3586 
3587 
3588 
3589 
3590 SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p)
3591 {
3592   p->needWrite_Size = 0;
3593   p->status = ZSTD_STATUS_NOT_SPECIFIED;
3594   dec->disableHash = p->disableHash;
3595 
3596   if (p->outBuf_fromCaller)
3597   {
3598     dec->decoder.win = p->outBuf_fromCaller;
3599     dec->decoder.cycSize = p->outBufSize_fromCaller;
3600   }
3601 
3602   // p->winPos = dec->decoder.winPos;
3603 
3604   for (;;)
3605   {
3606     SizeT winPos, size;
3607     // SizeT outProcessed;
3608     SRes res;
3609 
3610     if (p->wrPos > dec->decoder.winPos)
3611       return SZ_ERROR_FAIL;
3612 
3613     if (dec->frameState == ZSTD2_STATE_FINISHED)
3614     {
3615       if (!p->outBuf_fromCaller)
3616       {
3617         // we need to set positions to zero for new frame.
3618         if (p->wrPos != dec->decoder.winPos)
3619         {
3620           /* We have already asked the caller to flush all data
3621              with (p->needWrite_Size) and (ZSTD_STATUS_FINISHED_FRAME) status.
3622              So it's unexpected case */
3623           // p->winPos = dec->decoder.winPos;
3624           // p->needWrite_Size = dec->decoder.winPos - p->wrPos; // flush size asking
3625           // return SZ_OK; // ask to flush again
3626           return SZ_ERROR_FAIL;
3627         }
3628         // (p->wrPos == dec->decoder.winPos), and we wrap to zero:
3629         dec->decoder.winPos = 0;
3630         p->winPos = 0;
3631         p->wrPos = 0;
3632       }
3633       ZstdDec_Init_ForNewFrame(dec);
3634       // continue;
3635     }
3636 
3637     winPos = dec->decoder.winPos;
3638     {
3639       SizeT next = dec->decoder.cycSize;
3640       /* cycSize == 0, if no buffer was allocated still,
3641          or, if (outBuf_fromCaller) mode and (outBufSize_fromCaller == 0) */
3642       if (!p->outBuf_fromCaller
3643           && next
3644           && next <= winPos
3645           && dec->isCyclicMode)
3646       {
3647         // (0 < decoder.cycSize <= winPos) in isCyclicMode.
3648         // so we need to wrap (winPos) and (wrPos) over (cycSize).
3649         const size_t delta = next;
3650         // (delta) is how many bytes we remove from buffer.
3651         /*
3652         // we don't need data older than last (cycSize) bytes.
3653         size_t delta = winPos - next; // num bytes after (cycSize)
3654         if (delta <= next) // it's expected case
3655           delta = next;
3656         // delta == Max(cycSize, winPos - cycSize)
3657         */
3658         if (p->wrPos < delta)
3659         {
3660           // (wrPos < decoder.cycSize)
3661           // We have asked already the caller to flush required data
3662           // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3663           // p->winPos = winPos;
3664           // p->needWrite_Size = delta - p->wrPos; // flush size asking
3665           // return SZ_OK; // ask to flush again
3666           return SZ_ERROR_FAIL;
3667         }
3668         // p->wrPos >= decoder.cycSize
3669         // we move extra data after (decoder.cycSize) to start of cyclic buffer:
3670         winPos -= delta;
3671         if (winPos)
3672         {
3673           if (winPos >= delta)
3674             return SZ_ERROR_FAIL;
3675           memmove(dec->decoder.win, dec->decoder.win + delta, winPos);
3676           // printf("\nmemmove processed=%8x winPos=%8x\n", (unsigned)p->outProcessed, (unsigned)dec->decoder.winPos);
3677           STAT_INC(g_Num_Wrap_memmove_Num)
3678           STAT_UPDATE(g_Num_Wrap_memmove_Bytes += (unsigned)winPos;)
3679         }
3680         dec->decoder.winPos = winPos;
3681         p->winPos = winPos;
3682         p->wrPos -= delta;
3683         // dec->xxh64_winPos -= delta;
3684 
3685         // (winPos < delta)
3686         #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3687           /* we set the data after cycSize, because
3688              we don't want to read non-initialized data or junk in CopyMatch(). */
3689           memset(dec->decoder.win + next, 0, COPY_CHUNK_SIZE);
3690         #endif
3691 
3692         /*
3693         if (winPos == next)
3694         {
3695           if (winPos != p->wrPos)
3696           {
3697             // we already requested before to flush full data for that case.
3698             //   but we give the caller a second chance to flush data:
3699             p->needWrite_Size = winPos - p->wrPos;
3700             return SZ_OK;
3701           }
3702           // (decoder.cycSize == winPos == p->wrPos)
3703           // so we do second wrapping to zero:
3704           winPos = 0;
3705           dec->decoder.winPos = 0;
3706           p->winPos = 0;
3707           p->wrPos = 0;
3708         }
3709         */
3710         // (winPos < next)
3711       }
3712 
3713       if (winPos > next)
3714         return SZ_ERROR_FAIL; // it's unexpected case
3715       /*
3716         if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3717           then (winPos <  cycSize)
3718           else (winPos <= cycSize)
3719       */
3720       if (!p->outBuf_fromCaller)
3721       {
3722         // that code is optional. We try to optimize write chunk sizes.
3723         /* (next2) is expected next write position in the caller,
3724            if the caller writes by kBlockSizeMax chunks.
3725         */
3726         /*
3727         const size_t next2 = (winPos + kBlockSizeMax) & (kBlockSizeMax - 1);
3728         if (winPos < next2 && next2 < next)
3729           next = next2;
3730         */
3731       }
3732       size = next - winPos;
3733     }
3734 
3735     // note: ZstdDec_DecodeBlock() uses (winLimit = winPos + size) only for RLE and RAW blocks
3736     res = ZstdDec_DecodeBlock(dec, p, size);
3737     /*
3738       after one block decoding:
3739       if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3740         then (winPos <  cycSize + max_block_size)
3741         else (winPos <= cycSize)
3742     */
3743 
3744     if (!p->outBuf_fromCaller)
3745       p->win = dec->decoder.win;
3746     p->winPos = dec->decoder.winPos;
3747 
3748     // outProcessed = dec->decoder.winPos - winPos;
3749     // p->outProcessed += outProcessed;
3750 
3751     if (res != SZ_OK)
3752       return res;
3753 
3754     if (dec->frameState != ZSTD2_STATE_AFTER_HEADER)
3755     {
3756       if (p->outBuf_fromCaller)
3757         return SZ_OK;
3758       {
3759         // !p->outBuf_fromCaller
3760         /*
3761           if (ZSTD_STATUS_FINISHED_FRAME), we request full flushing here because
3762             1) it's simpler to work with allocation and extracting of next frame,
3763             2) it's better to start writing to next new frame with aligned memory
3764                for faster xxh 64-bit reads.
3765         */
3766         size_t end = dec->decoder.winPos;  // end pos for all data flushing
3767         if (p->status != ZSTD_STATUS_FINISHED_FRAME)
3768         {
3769           // we will request flush here only for cases when wrap in cyclic buffer can be required in next call.
3770           if (!dec->isCyclicMode)
3771             return SZ_OK;
3772           // isCyclicMode
3773           {
3774             const size_t delta = dec->decoder.cycSize;
3775             if (end < delta)
3776               return SZ_OK; // (winPos < cycSize). no need for flush
3777             // cycSize <= winPos
3778             // So we ask the caller to flush of (cycSize - wrPos) bytes,
            // and then we will wrap cyclicBuffer in next call
3780             end = delta;
3781           }
3782         }
3783         p->needWrite_Size = end - p->wrPos;
3784       }
3785       return SZ_OK;
3786     }
3787 
3788     // ZSTD2_STATE_AFTER_HEADER
3789     {
3790       BoolInt useCyclic = False;
3791       size_t cycSize;
3792 
3793       // p->status = ZSTD_STATUS_NOT_FINISHED;
3794       if (dec->dictionaryId != 0)
3795       {
3796         /* actually we can try to decode some data,
3797            because it's possible that some data doesn't use dictionary */
3798         // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3799         return SZ_ERROR_UNSUPPORTED;
3800       }
3801 
3802       {
3803         UInt64 winSize = dec->contentSize;
3804         UInt64 winSize_Allocate = winSize;
3805         const unsigned descriptor = dec->descriptor;
3806 
3807         if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3808         {
3809           const Byte wd = dec->windowDescriptor;
3810           winSize = (UInt64)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3);
3811           if (!DESCRIPTOR_Is_ContentSize_Defined(descriptor)
3812               || winSize_Allocate > winSize)
3813           {
3814             winSize_Allocate = winSize;
3815             useCyclic = True;
3816           }
3817         }
3818         /*
3819         else
3820         {
3821           if (p->info.singleSegment_ContentSize_MAX < winSize)
3822               p->info.singleSegment_ContentSize_MAX = winSize;
3823           // p->info.num_SingleSegments++;
3824         }
3825         */
3826         if (p->info.windowSize_MAX < winSize)
3827             p->info.windowSize_MAX = winSize;
3828         if (p->info.windowSize_Allocate_MAX < winSize_Allocate)
3829             p->info.windowSize_Allocate_MAX = winSize_Allocate;
3830         /*
3831            winSize_Allocate is MIN(content_size, window_size_from_descriptor).
           Even if (content_size < (window_size_from_descriptor))
3833              original-zstd still uses (window_size_from_descriptor) to check that decoding is allowed.
3834            We try to follow original-zstd, and here we check (winSize) instead of (winSize_Allocate))
3835         */
3836         if (
3837               // winSize_Allocate   // it's relaxed check
3838               winSize               // it's more strict check to be compatible with original-zstd
3839             > ((UInt64)1 << MAX_WINDOW_SIZE_LOG))
3840           return SZ_ERROR_UNSUPPORTED; // SZ_ERROR_MEM
3841         cycSize = (size_t)winSize_Allocate;
3842         if (cycSize != winSize_Allocate)
3843           return SZ_ERROR_MEM;
3844         // cycSize <= winSize
3845         /* later we will use (CZstdDec1::winSize) to check match offsets and check block sizes.
3846            if (there is window descriptor)
3847            {
3848              We will check block size with (window_size_from_descriptor) instead of (winSize_Allocate).
3849              Does original-zstd do it that way also?
3850            }
3851            Here we must reduce full real 64-bit (winSize) to size_t for (CZstdDec1::winSize).
3852            Also we don't want too big values for (CZstdDec1::winSize).
3853            our (CZstdDec1::winSize) will meet the condition:
3854              (CZstdDec1::winSize < kBlockSizeMax || CZstdDec1::winSize <= cycSize).
3855         */
3856         dec->decoder.winSize = (winSize < kBlockSizeMax) ? (size_t)winSize: cycSize;
3857         // note: (CZstdDec1::winSize > cycSize) is possible, if (!useCyclic)
3858       }
3859 
3860       RINOK(ZstdDec_AllocateMisc(dec))
3861 
3862       if (p->outBuf_fromCaller)
3863         dec->isCyclicMode = False;
3864       else
3865       {
3866         size_t d = cycSize;
3867 
3868         if (dec->decoder.winPos != p->wrPos)
3869           return SZ_ERROR_FAIL;
3870 
3871         dec->decoder.winPos = 0;
3872         p->wrPos = 0;
3873         p->winPos = dec->decoder.winPos;
3874 
3875         /*
3876         const size_t needWrite = dec->decoder.winPos - p->wrPos;
3877         if (!needWrite)
3878         {
3879           dec->decoder.winPos = 0;
3880           p->wrPos = 0;
3881           p->winPos = dec->decoder.winPos;
3882         }
3883         */
3884         /* if (!useCyclic) we allocate only cycSize = ContentSize.
3885            But if we want to support the case where new frame starts with winPos != 0,
3886            then we will wrap over zero, and we still need
3887            to set (useCyclic) and allocate additional buffer spaces.
3888            Now we don't allow new frame starting with (winPos != 0).
3889            so (dec->decoder->winPos == 0)
3890            can use (!useCyclic) with reduced buffer sizes.
3891         */
3892         /*
3893         if (dec->decoder->winPos != 0)
3894           useCyclic = True;
3895         */
3896 
3897         if (useCyclic)
3898         {
          /* cyclic buffer size must be at least (COPY_CHUNK_SIZE - 1) bytes
             larger than window size, because CopyMatch() can write additional
             (COPY_CHUNK_SIZE - 1) bytes and overwrite oldest data in cyclic buffer.
3902              But for performance reasons we align (cycSize) for (kBlockSizeMax).
3903              also we must provide (cycSize >= max_decoded_data_after_cycSize),
3904              because after data move wrapping over zero we must provide (winPos < cycSize).
3905           */
3906           const size_t alignSize = kBlockSizeMax;
3907           /* here we add (1 << 7) instead of (COPY_CHUNK_SIZE - 1), because
3908              we want to get same (cycSize) for different COPY_CHUNK_SIZE values. */
3909           // cycSize += (COPY_CHUNK_SIZE - 1) + (alignSize - 1); // for debug : we can get smallest (cycSize)
3910           cycSize += (1 << 7) + alignSize;
3911           cycSize &= ~(size_t)(alignSize - 1);
3912           // cycSize must be aligned for 32, because xxh requires 32-bytes blocks.
3913           // cycSize += 12345; // for debug
3914           // cycSize += 1 << 10; // for debug
3915           // cycSize += 32; // for debug
3916           // cycSize += kBlockSizeMax; // for debug
3917           if (cycSize < d)
3918             return SZ_ERROR_MEM;
3919           /*
3920              in cyclic buffer mode we allow to decode one additional block
3921              that exceeds (cycSize).
3922              So we must allocate additional (kBlockSizeMax) bytes after (cycSize).
3923              if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
3924              {
3925                we can read (COPY_CHUNK_SIZE - 1) bytes after (cycSize)
               but we already allocate additional kBlockSizeMax that
3927                is larger than COPY_CHUNK_SIZE.
3928                So we don't need additional space of COPY_CHUNK_SIZE after (cycSize).
3929              }
3930           */
3931           /*
3932           #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3933           d = cycSize + (1 << 7); // we must add at least (COPY_CHUNK_SIZE - 1)
3934           #endif
3935           */
3936           d = cycSize + kBlockSizeMax;
3937           if (d < cycSize)
3938             return SZ_ERROR_MEM;
3939         }
3940 
3941         {
3942           const size_t kMinWinAllocSize = 1 << 12;
3943           if (d < kMinWinAllocSize)
3944               d = kMinWinAllocSize;
3945         }
3946 
3947         if (d > dec->winBufSize_Allocated)
3948         {
3949           /*
3950           if (needWrite)
3951           {
3952             p->needWrite_Size = needWrite;
3953             return SZ_OK;
3954             // return SZ_ERROR_FAIL;
3955           }
3956           */
3957 
3958           if (dec->winBufSize_Allocated != 0)
3959           {
3960             const size_t k_extra = (useCyclic || d >= (1u << 20)) ?
3961                 2 * kBlockSizeMax : 0;
3962             unsigned i = useCyclic ? 17 : 12;
3963             for (; i < sizeof(size_t) * 8; i++)
3964             {
3965               const size_t d2 = ((size_t)1 << i) + k_extra;
3966               if (d2 >= d)
3967               {
3968                 d = d2;
3969                 break;
3970               }
3971             }
3972           }
3973           // RINOK(ZstdDec_AllocateWindow(dec, d))
3974           ZstdDec_FreeWindow(dec);
3975           dec->win_Base = (Byte *)ISzAlloc_Alloc(dec->alloc_Big, d);
3976           if (!dec->win_Base)
3977             return SZ_ERROR_MEM;
3978           dec->decoder.win = dec->win_Base;
3979           dec->winBufSize_Allocated = d;
3980         }
3981         /*
3982         else
3983         {
          // for non-cyclicMode we want to flush data, and set winPos = 0
3985           if (needWrite)
3986           {
3987             if (!useCyclic || dec->decoder.winPos >= cycSize)
3988             {
3989               p->needWrite_Size = needWrite;
3990               return SZ_OK;
3991               // return SZ_ERROR_FAIL;
3992             }
3993           }
3994         }
3995         */
3996 
3997         dec->decoder.cycSize = cycSize;
3998         p->win = dec->decoder.win;
3999         // p->cycSize = dec->decoder.cycSize;
4000         dec->isCyclicMode = (Byte)useCyclic;
4001       } // (!p->outBuf_fromCaller) end
4002 
4003       // p->winPos = dec->decoder.winPos;
4004       dec->frameState = ZSTD2_STATE_BLOCK;
4005       // continue;
4006     } // ZSTD2_STATE_AFTER_HEADER end
4007   }
4008 }
4009 
4010 
4011 void ZstdDec_GetResInfo(const CZstdDec *dec,
4012     const CZstdDecState *p,
4013     SRes res,
4014     CZstdDecResInfo *stat)
4015 {
4016   // ZstdDecInfo_CLEAR(stat);
4017   stat->extraSize = 0;
4018   stat->is_NonFinishedFrame = False;
4019   if (dec->frameState != ZSTD2_STATE_FINISHED)
4020   {
4021     if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4022     {
4023       stat->extraSize = (Byte)dec->tempSize;
4024       if (ZstdDecInfo_GET_NUM_FRAMES(&p->info) == 0)
4025         res = SZ_ERROR_NO_ARCHIVE;
4026     }
4027     else
4028     {
4029       stat->is_NonFinishedFrame = True;
4030       if (res == SZ_OK && p->status == ZSTD_STATUS_NEEDS_MORE_INPUT)
4031         res = SZ_ERROR_INPUT_EOF;
4032     }
4033   }
4034   stat->decode_SRes = res;
4035 }
4036 
4037 
4038 size_t ZstdDec_ReadUnusedFromInBuf(
4039     CZstdDecHandle dec,
4040     size_t afterDecoding_tempPos,
4041     void *data, size_t size)
4042 {
4043   size_t processed = 0;
4044   if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4045   {
4046     Byte *dest = (Byte *)data;
4047     const size_t tempSize = dec->tempSize;
4048     while (afterDecoding_tempPos < tempSize)
4049     {
4050       if (size == 0)
4051         break;
4052       size--;
4053       *dest++ = dec->temp[afterDecoding_tempPos++];
4054       processed++;
4055     }
4056   }
4057   return processed;
4058 }
4059 
4060 
4061 void ZstdDecState_Clear(CZstdDecState *p)
4062 {
4063   memset(p, 0 , sizeof(*p));
4064 }
4065