• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) Yann Collet, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 #include <limits.h>
12 #include <math.h>
13 #include <stddef.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <string.h>
17 
18 #include "util.h"
19 #include "timefn.h"   /* UTIL_clockSpanMicro, SEC_TO_MICRO, UTIL_TIME_INITIALIZER */
20 #include "zstd.h"
21 #include "zstd_internal.h"
22 #include "mem.h"
23 #define ZDICT_STATIC_LINKING_ONLY
24 #include "zdict.h"
25 
26 /* Direct access to internal compression functions is required */
27 #include "zstd_compress.c"
28 
29 #define XXH_STATIC_LINKING_ONLY
30 #include "xxhash.h"     /* XXH64 */
31 
/* Smaller of two values. Each argument is expanded twice, so operands with
 * side effects must be avoided. Left alone if MIN already exists. */
#if !defined(MIN)
    #define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

/* Path length limit: keep an existing MAX_PATH, otherwise reuse PATH_MAX when
 * it is defined, otherwise fall back to 256. */
#if !defined(MAX_PATH)
    #if defined(PATH_MAX)
        #define MAX_PATH PATH_MAX
    #else
        #define MAX_PATH 256
    #endif
#endif
43 
44 /*-************************************
45 *  DISPLAY Macros
46 **************************************/
/* Print a printf-style message on stderr, unconditionally. */
#define DISPLAY(...)          fprintf(stderr, __VA_ARGS__)
/* Print a printf-style message on stderr when the verbosity level
 * `g_displayLevel` is at least `l`.
 * The body is wrapped in do { } while (0) so that `DISPLAYLEVEL(...);` is
 * exactly one statement and remains valid inside an unbraced if/else.
 * `l` is parenthesized so any expression can be used as the level. */
#define DISPLAYLEVEL(l, ...)                                                   \
    do {                                                                       \
        if (g_displayLevel >= (l)) { DISPLAY(__VA_ARGS__); }                   \
    } while (0)
49 static U32 g_displayLevel = 2;
50 
51 #define DISPLAYUPDATE(...)                                                     \
52     do {                                                                       \
53         if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) ||           \
54             (g_displayLevel >= 4)) {                                           \
55             g_displayClock = UTIL_getTime();                                   \
56             DISPLAY(__VA_ARGS__);                                              \
57             if (g_displayLevel >= 4) fflush(stderr);                           \
58         }                                                                      \
59     } while (0)
60 
61 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
63 
/* Abort the program with a diagnostic when `code` is a zstd error code.
 * `code` is evaluated exactly once (it is copied into a local first), so it
 * is safe to pass an expression with side effects such as a function call. */
#define CHECKERR(code)                                                         \
    do {                                                                       \
        size_t const checkerr_code_ = (code);                                  \
        if (ZSTD_isError(checkerr_code_)) {                                    \
            DISPLAY("Error occurred while generating data: %s\n",              \
                    ZSTD_getErrorName(checkerr_code_));                        \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
72 
73 /*-*******************************************************
74 *  Random function
75 *********************************************************/
/* Advance the generator state stored in `*src` and return the next
 * pseudo-random 32-bit value. The new state is the old state multiplied and
 * offset by two constants, then rotated left by 13 bits; the returned value
 * is that state rotated left by a further 27 bits. */
static U32 RAND(U32* src)
{
    U32 const multiplier = 2654435761U;
    U32 const increment  = 2246822519U;
    U32 const mixed = (*src * multiplier) + increment;
    U32 const state = (mixed << 13) | (mixed >> (32 - 13));

    *src = state;
    return (state << 27) | (state >> (32 - 27));
}
89 
/* Number of entries in a symbol distribution table (filled by RAND_genDist,
 * sampled by RAND_bufferDist) */
#define DISTSIZE (8192)
91 
92 /* Write `size` bytes into `ptr`, all of which are less than or equal to `maxSymb` */
RAND_bufferMaxSymb(U32 * seed,void * ptr,size_t size,int maxSymb)93 static void RAND_bufferMaxSymb(U32* seed, void* ptr, size_t size, int maxSymb)
94 {
95     size_t i;
96     BYTE* op = ptr;
97 
98     for (i = 0; i < size; i++) {
99         op[i] = (BYTE) (RAND(seed) % (maxSymb + 1));
100     }
101 }
102 
103 /* Write `size` random bytes into `ptr` */
RAND_buffer(U32 * seed,void * ptr,size_t size)104 static void RAND_buffer(U32* seed, void* ptr, size_t size)
105 {
106     size_t i;
107     BYTE* op = ptr;
108 
109     for (i = 0; i + 4 <= size; i += 4) {
110         MEM_writeLE32(op + i, RAND(seed));
111     }
112     for (; i < size; i++) {
113         op[i] = RAND(seed) & 0xff;
114     }
115 }
116 
117 /* Write `size` bytes into `ptr` following the distribution `dist` */
RAND_bufferDist(U32 * seed,BYTE * dist,void * ptr,size_t size)118 static void RAND_bufferDist(U32* seed, BYTE* dist, void* ptr, size_t size)
119 {
120     size_t i;
121     BYTE* op = ptr;
122 
123     for (i = 0; i < size; i++) {
124         op[i] = dist[RAND(seed) % DISTSIZE];
125     }
126 }
127 
128 /* Generate a random distribution where the frequency of each symbol follows a
129  * geometric distribution defined by `weight`
130  * `dist` should have size at least `DISTSIZE` */
RAND_genDist(U32 * seed,BYTE * dist,double weight)131 static void RAND_genDist(U32* seed, BYTE* dist, double weight)
132 {
133     size_t i = 0;
134     size_t statesLeft = DISTSIZE;
135     BYTE symb = (BYTE) (RAND(seed) % 256);
136     BYTE step = (BYTE) ((RAND(seed) % 256) | 1); /* force it to be odd so it's relatively prime to 256 */
137 
138     while (i < DISTSIZE) {
139         size_t states = ((size_t)(weight * statesLeft)) + 1;
140         size_t j;
141         for (j = 0; j < states && i < DISTSIZE; j++, i++) {
142             dist[i] = symb;
143         }
144 
145         symb += step;
146         statesLeft -= states;
147     }
148 }
149 
150 /* Generates a random number in the range [min, max) */
RAND_range(U32 * seed,U32 min,U32 max)151 static inline U32 RAND_range(U32* seed, U32 min, U32 max)
152 {
153     return (RAND(seed) % (max-min)) + min;
154 }
155 
/* Round a non-negative floating-point value to the nearest U32.
 * The argument is parenthesized so that expressions using operators of lower
 * precedence than `+` (e.g. a conditional expression) are rounded as a whole. */
#define ROUND(x) ((U32)((x) + 0.5))
157 
158 /* Generates a random number in an exponential distribution with mean `mean` */
RAND_exp(U32 * seed,double mean)159 static double RAND_exp(U32* seed, double mean)
160 {
161     double const u = RAND(seed) / (double) UINT_MAX;
162     return log(1-u) * (-mean);
163 }
164 
165 /*-*******************************************************
166 *  Constants and Structs
167 *********************************************************/
/* Printable block type names, in the order raw, rle, compressed */
const char* BLOCK_TYPES[] = { "raw", "rle", "compressed" };
169 
/* Largest content size the generator supports: 2^20 bytes */
#define MAX_DECOMPRESSED_SIZE_LOG 20
#define MAX_DECOMPRESSED_SIZE (1ULL << MAX_DECOMPRESSED_SIZE_LOG)

/* Recommended support is 8MB, so limit to 4MB + mantissa */
#define MAX_WINDOW_LOG 22

/* Shortest match length, and the resulting upper bound on the number of
 * sequences that fit in one block of ZSTD_BLOCKSIZE_MAX bytes */
#define MIN_SEQ_LEN (3)
#define MAX_NB_SEQ ((ZSTD_BLOCKSIZE_MAX + MIN_SEQ_LEN - 1) / MIN_SEQ_LEN)
177 
178 BYTE CONTENT_BUFFER[MAX_DECOMPRESSED_SIZE];
179 BYTE FRAME_BUFFER[MAX_DECOMPRESSED_SIZE * 2];
180 BYTE LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX];
181 
182 seqDef SEQUENCE_BUFFER[MAX_NB_SEQ];
183 BYTE SEQUENCE_LITERAL_BUFFER[ZSTD_BLOCKSIZE_MAX]; /* storeSeq expects a place to copy literals to */
184 BYTE SEQUENCE_LLCODE[ZSTD_BLOCKSIZE_MAX];
185 BYTE SEQUENCE_MLCODE[ZSTD_BLOCKSIZE_MAX];
186 BYTE SEQUENCE_OFCODE[ZSTD_BLOCKSIZE_MAX];
187 
188 U64 WKSP[HUF_WORKSPACE_SIZE_U64];
189 
/* Sizes chosen for a generated frame */
typedef struct {
    /* 0 means unknown (unless contentSize == windowSize == 0) */
    size_t contentSize;
    /* contentSize >= windowSize means single segment */
    unsigned windowSize;
} frameHeader_t;
194 
195 /* For repeat modes */
196 typedef struct {
197     U32 rep[ZSTD_REP_NUM];
198 
199     int hufInit;
200     /* the distribution used in the previous block for repeat mode */
201     BYTE hufDist[DISTSIZE];
202     HUF_CElt hufTable [HUF_CTABLE_SIZE_ST(255)];
203 
204     int fseInit;
205     FSE_CTable offcodeCTable  [FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
206     FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
207     FSE_CTable litlengthCTable  [FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
208 
209     /* Symbols that were present in the previous distribution, for use with
210      * set_repeat */
211     BYTE litlengthSymbolSet[36];
212     BYTE offsetSymbolSet[29];
213     BYTE matchlengthSymbolSet[53];
214 } cblockStats_t;
215 
216 typedef struct {
217     void* data;
218     void* dataStart;
219     void* dataEnd;
220 
221     void* src;
222     void* srcStart;
223     void* srcEnd;
224 
225     frameHeader_t header;
226 
227     cblockStats_t stats;
228     cblockStats_t oldStats; /* so they can be rolled back if uncompressible */
229 } frame_t;
230 
231 typedef struct {
232     int useDict;
233     U32 dictID;
234     size_t dictContentSize;
235     BYTE* dictContent;
236 } dictInfo;
237 
/* What kind of output to generate */
typedef enum {
  /* generate frames */
  gt_frame = 0,
  /* generate compressed blocks without block/frame headers */
  gt_block,
} genType_e;
242 
243 /*-*******************************************************
244 *  Global variables (set from command line)
245 *********************************************************/
246 U32 g_maxDecompressedSizeLog = MAX_DECOMPRESSED_SIZE_LOG;  /* <= 20 */
247 U32 g_maxBlockSize = ZSTD_BLOCKSIZE_MAX;                       /* <= 128 KB */
248 
249 /*-*******************************************************
250 *  Generator Functions
251 *********************************************************/
252 
/* advanced options on generation */
struct {
    /* force the content size to be present */
    int contentSize;
} opts;
256 
257 /* Generate and write a random frame header */
writeFrameHeader(U32 * seed,frame_t * frame,dictInfo info)258 static void writeFrameHeader(U32* seed, frame_t* frame, dictInfo info)
259 {
260     BYTE* const op = frame->data;
261     size_t pos = 0;
262     frameHeader_t fh;
263 
264     BYTE windowByte = 0;
265 
266     int singleSegment = 0;
267     int contentSizeFlag = 0;
268     int fcsCode = 0;
269 
270     memset(&fh, 0, sizeof(fh));
271 
272     /* generate window size */
273     {
274         /* Follow window algorithm from specification */
275         int const exponent = RAND(seed) % (MAX_WINDOW_LOG - 10);
276         int const mantissa = RAND(seed) % 8;
277         windowByte = (BYTE) ((exponent << 3) | mantissa);
278         fh.windowSize = (1U << (exponent + 10));
279         fh.windowSize += fh.windowSize / 8 * mantissa;
280     }
281 
282     {
283         /* Generate random content size */
284         size_t highBit;
285         if (RAND(seed) & 7 && g_maxDecompressedSizeLog > 7) {
286             /* do content of at least 128 bytes */
287             highBit = 1ULL << RAND_range(seed, 7, g_maxDecompressedSizeLog);
288         } else if (RAND(seed) & 3) {
289             /* do small content */
290             highBit = 1ULL << RAND_range(seed, 0, MIN(7, 1U << g_maxDecompressedSizeLog));
291         } else {
292             /* 0 size frame */
293             highBit = 0;
294         }
295         fh.contentSize = highBit ? highBit + (RAND(seed) % highBit) : 0;
296 
297         /* provide size sometimes */
298         contentSizeFlag = opts.contentSize | (RAND(seed) & 1);
299 
300         if (contentSizeFlag && (fh.contentSize == 0 || !(RAND(seed) & 7))) {
301             /* do single segment sometimes */
302             fh.windowSize = (U32) fh.contentSize;
303             singleSegment = 1;
304         }
305     }
306 
307     if (contentSizeFlag) {
308         /* Determine how large fcs field has to be */
309         int minFcsCode = (fh.contentSize >= 256) +
310                                (fh.contentSize >= 65536 + 256) +
311                                (fh.contentSize > 0xFFFFFFFFU);
312         if (!singleSegment && !minFcsCode) {
313             minFcsCode = 1;
314         }
315         fcsCode = minFcsCode + (RAND(seed) % (4 - minFcsCode));
316         if (fcsCode == 1 && fh.contentSize < 256) fcsCode++;
317     }
318 
319     /* write out the header */
320     MEM_writeLE32(op + pos, ZSTD_MAGICNUMBER);
321     pos += 4;
322 
323     {
324         /*
325          * fcsCode: 2-bit flag specifying how many bytes used to represent Frame_Content_Size (bits 7-6)
326          * singleSegment: 1-bit flag describing if data must be regenerated within a single continuous memory segment. (bit 5)
327          * contentChecksumFlag: 1-bit flag that is set if frame includes checksum at the end -- set to 1 below (bit 2)
328          * dictBits: 2-bit flag describing how many bytes Dictionary_ID uses -- set to 3 (bits 1-0)
329          * For more information: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
330          */
331         int const dictBits = info.useDict ? 3 : 0;
332         BYTE const frameHeaderDescriptor =
333                 (BYTE) ((fcsCode << 6) | (singleSegment << 5) | (1 << 2) | dictBits);
334         op[pos++] = frameHeaderDescriptor;
335     }
336 
337     if (!singleSegment) {
338         op[pos++] = windowByte;
339     }
340     if (info.useDict) {
341         MEM_writeLE32(op + pos, (U32) info.dictID);
342         pos += 4;
343     }
344     if (contentSizeFlag) {
345         switch (fcsCode) {
346         default: /* Impossible */
347         case 0: op[pos++] = (BYTE) fh.contentSize; break;
348         case 1: MEM_writeLE16(op + pos, (U16) (fh.contentSize - 256)); pos += 2; break;
349         case 2: MEM_writeLE32(op + pos, (U32) fh.contentSize); pos += 4; break;
350         case 3: MEM_writeLE64(op + pos, (U64) fh.contentSize); pos += 8; break;
351         }
352     }
353 
354     DISPLAYLEVEL(3, " frame content size:\t%u\n", (unsigned)fh.contentSize);
355     DISPLAYLEVEL(3, " frame window size:\t%u\n", fh.windowSize);
356     DISPLAYLEVEL(3, " content size flag:\t%d\n", contentSizeFlag);
357     DISPLAYLEVEL(3, " single segment flag:\t%d\n", singleSegment);
358 
359     frame->data = op + pos;
360     frame->header = fh;
361 }
362 
363 /* Write a literal block in either raw or RLE form, return the literals size */
writeLiteralsBlockSimple(U32 * seed,frame_t * frame,size_t contentSize)364 static size_t writeLiteralsBlockSimple(U32* seed, frame_t* frame, size_t contentSize)
365 {
366     BYTE* op = (BYTE*)frame->data;
367     int const type = RAND(seed) % 2;
368     int const sizeFormatDesc = RAND(seed) % 8;
369     size_t litSize;
370     size_t maxLitSize = MIN(contentSize, g_maxBlockSize);
371 
372     if (sizeFormatDesc == 0) {
373         /* Size_FormatDesc = ?0 */
374         maxLitSize = MIN(maxLitSize, 31);
375     } else if (sizeFormatDesc <= 4) {
376         /* Size_FormatDesc = 01 */
377         maxLitSize = MIN(maxLitSize, 4095);
378     } else {
379         /* Size_Format = 11 */
380         maxLitSize = MIN(maxLitSize, 1048575);
381     }
382 
383     litSize = RAND(seed) % (maxLitSize + 1);
384     if (frame->src == frame->srcStart && litSize == 0) {
385         litSize = 1; /* no empty literals if there's nothing preceding this block */
386     }
387     if (litSize + 3 > contentSize) {
388         litSize = contentSize; /* no matches shorter than 3 are allowed */
389     }
390     /* use smallest size format that fits */
391     if (litSize < 32) {
392         op[0] = (type | (0 << 2) | (litSize << 3)) & 0xff;
393         op += 1;
394     } else if (litSize < 4096) {
395         op[0] = (type | (1 << 2) | (litSize << 4)) & 0xff;
396         op[1] = (litSize >> 4) & 0xff;
397         op += 2;
398     } else {
399         op[0] = (type | (3 << 2) | (litSize << 4)) & 0xff;
400         op[1] = (litSize >> 4) & 0xff;
401         op[2] = (litSize >> 12) & 0xff;
402         op += 3;
403     }
404 
405     if (type == 0) {
406         /* Raw literals */
407         DISPLAYLEVEL(4, "   raw literals\n");
408 
409         RAND_buffer(seed, LITERAL_BUFFER, litSize);
410         memcpy(op, LITERAL_BUFFER, litSize);
411         op += litSize;
412     } else {
413         /* RLE literals */
414         BYTE const symb = (BYTE) (RAND(seed) % 256);
415 
416         DISPLAYLEVEL(4, "   rle literals: 0x%02x\n", (unsigned)symb);
417 
418         memset(LITERAL_BUFFER, symb, litSize);
419         op[0] = symb;
420         op++;
421     }
422 
423     frame->data = op;
424 
425     return litSize;
426 }
427 
428 /* Generate a Huffman header for the given source */
writeHufHeader(U32 * seed,HUF_CElt * hufTable,void * dst,size_t dstSize,const void * src,size_t srcSize)429 static size_t writeHufHeader(U32* seed, HUF_CElt* hufTable, void* dst, size_t dstSize,
430                                  const void* src, size_t srcSize)
431 {
432     BYTE* const ostart = (BYTE*)dst;
433     BYTE* op = ostart;
434 
435     unsigned huffLog = 11;
436     unsigned maxSymbolValue = 255;
437 
438     unsigned count[HUF_SYMBOLVALUE_MAX+1];
439 
440     /* Scan input and build symbol stats */
441     {   size_t const largest = HIST_count_wksp (count, &maxSymbolValue, (const BYTE*)src, srcSize, WKSP, sizeof(WKSP));
442         assert(!HIST_isError(largest));
443         if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 0; }   /* single symbol, rle */
444         if (largest <= (srcSize >> 7)+1) return 0;   /* Fast heuristic : not compressible enough */
445     }
446 
447     /* Build Huffman Tree */
448     /* Max Huffman log is 11, min is highbit(maxSymbolValue)+1 */
449     huffLog = RAND_range(seed, ZSTD_highbit32(maxSymbolValue)+1, huffLog+1);
450     DISPLAYLEVEL(6, "     huffman log: %u\n", huffLog);
451     {   size_t const maxBits = HUF_buildCTable_wksp (hufTable, count, maxSymbolValue, huffLog, WKSP, sizeof(WKSP));
452         CHECKERR(maxBits);
453         huffLog = (U32)maxBits;
454     }
455 
456     /* Write table description header */
457     {   size_t const hSize = HUF_writeCTable (op, dstSize, hufTable, maxSymbolValue, huffLog);
458         if (hSize + 12 >= srcSize) return 0;   /* not useful to try compression */
459         op += hSize;
460     }
461 
462     return op - ostart;
463 }
464 
465 /* Write a Huffman coded literals block and return the literals size */
writeLiteralsBlockCompressed(U32 * seed,frame_t * frame,size_t contentSize)466 static size_t writeLiteralsBlockCompressed(U32* seed, frame_t* frame, size_t contentSize)
467 {
468     BYTE* origop = (BYTE*)frame->data;
469     BYTE* opend = (BYTE*)frame->dataEnd;
470     BYTE* op;
471     BYTE* const ostart = origop;
472     int const sizeFormat = RAND(seed) % 4;
473     size_t litSize;
474     size_t hufHeaderSize = 0;
475     size_t compressedSize = 0;
476     size_t maxLitSize = MIN(contentSize-3, g_maxBlockSize);
477 
478     symbolEncodingType_e hType;
479 
480     if (contentSize < 64) {
481         /* make sure we get reasonably-sized literals for compression */
482         return ERROR(GENERIC);
483     }
484 
485     DISPLAYLEVEL(4, "   compressed literals\n");
486 
487     switch (sizeFormat) {
488     case 0: /* fall through, size is the same as case 1 */
489     case 1:
490         maxLitSize = MIN(maxLitSize, 1023);
491         origop += 3;
492         break;
493     case 2:
494         maxLitSize = MIN(maxLitSize, 16383);
495         origop += 4;
496         break;
497     case 3:
498         maxLitSize = MIN(maxLitSize, 262143);
499         origop += 5;
500         break;
501     default:; /* impossible */
502     }
503 
504     do {
505         op = origop;
506         do {
507             litSize = RAND(seed) % (maxLitSize + 1);
508         } while (litSize < 32); /* avoid small literal sizes */
509         if (litSize + 3 > contentSize) {
510             litSize = contentSize; /* no matches shorter than 3 are allowed */
511         }
512 
513         /* most of the time generate a new distribution */
514         if ((RAND(seed) & 3) || !frame->stats.hufInit) {
515             do {
516                 if (RAND(seed) & 3) {
517                     /* add 10 to ensure some compressibility */
518                     double const weight = ((RAND(seed) % 90) + 10) / 100.0;
519 
520                     DISPLAYLEVEL(5, "    distribution weight: %d%%\n",
521                                  (int)(weight * 100));
522 
523                     RAND_genDist(seed, frame->stats.hufDist, weight);
524                 } else {
525                     /* sometimes do restricted range literals to force
526                      * non-huffman headers */
527                     DISPLAYLEVEL(5, "    small range literals\n");
528                     RAND_bufferMaxSymb(seed, frame->stats.hufDist, DISTSIZE,
529                                        15);
530                 }
531                 RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
532                                 litSize);
533 
534                 /* generate the header from the distribution instead of the
535                  * actual data to avoid bugs with symbols that were in the
536                  * distribution but never showed up in the output */
537                 hufHeaderSize = writeHufHeader(
538                         seed, frame->stats.hufTable, op, opend - op,
539                         frame->stats.hufDist, DISTSIZE);
540                 CHECKERR(hufHeaderSize);
541                 /* repeat until a valid header is written */
542             } while (hufHeaderSize == 0);
543             op += hufHeaderSize;
544             hType = set_compressed;
545 
546             frame->stats.hufInit = 1;
547         } else {
548             /* repeat the distribution/table from last time */
549             DISPLAYLEVEL(5, "    huffman repeat stats\n");
550             RAND_bufferDist(seed, frame->stats.hufDist, LITERAL_BUFFER,
551                             litSize);
552             hufHeaderSize = 0;
553             hType = set_repeat;
554         }
555 
556         do {
557             compressedSize =
558                     sizeFormat == 0
559                             ? HUF_compress1X_usingCTable(
560                                       op, opend - op, LITERAL_BUFFER, litSize,
561                                       frame->stats.hufTable)
562                             : HUF_compress4X_usingCTable(
563                                       op, opend - op, LITERAL_BUFFER, litSize,
564                                       frame->stats.hufTable);
565             CHECKERR(compressedSize);
566             /* this only occurs when it could not compress or similar */
567         } while (compressedSize <= 0);
568 
569         op += compressedSize;
570 
571         compressedSize += hufHeaderSize;
572         DISPLAYLEVEL(5, "    regenerated size: %u\n", (unsigned)litSize);
573         DISPLAYLEVEL(5, "    compressed size: %u\n", (unsigned)compressedSize);
574         if (compressedSize >= litSize) {
575             DISPLAYLEVEL(5, "     trying again\n");
576             /* if we have to try again, reset the stats so we don't accidentally
577              * try to repeat a distribution we just made */
578             frame->stats = frame->oldStats;
579         } else {
580             break;
581         }
582     } while (1);
583 
584     /* write header */
585     switch (sizeFormat) {
586     case 0: /* fall through, size is the same as case 1 */
587     case 1: {
588         U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
589                            ((U32)compressedSize << 14);
590         MEM_writeLE24(ostart, header);
591         break;
592     }
593     case 2: {
594         U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
595                            ((U32)compressedSize << 18);
596         MEM_writeLE32(ostart, header);
597         break;
598     }
599     case 3: {
600         U32 const header = hType | (sizeFormat << 2) | ((U32)litSize << 4) |
601                            ((U32)compressedSize << 22);
602         MEM_writeLE32(ostart, header);
603         ostart[4] = (BYTE)(compressedSize >> 10);
604         break;
605     }
606     default:; /* impossible */
607     }
608 
609     frame->data = op;
610     return litSize;
611 }
612 
/* Write the literals section of a block and return the literals size.
 * Huffman-compressed literals are chosen most of the time, but only for
 * segments of at least 64 bytes, to avoid compressibility issues; otherwise a
 * raw or RLE section is written. */
static size_t writeLiteralsBlock(U32* seed, frame_t* frame, size_t contentSize)
{
    int const preferCompressed = (RAND(seed) & 7) != 0;

    if (preferCompressed && contentSize >= 64) {
        return writeLiteralsBlockCompressed(seed, frame, contentSize);
    }
    return writeLiteralsBlockSimple(seed, frame, contentSize);
}
622 
/* Attach the static sequence buffers to `seqStore`, set its capacities, and
 * reset it so that it is ready to receive the sequences of a new block. */
static inline void initSeqStore(seqStore_t *seqStore) {
    /* storage */
    seqStore->sequencesStart = SEQUENCE_BUFFER;
    seqStore->litStart = SEQUENCE_LITERAL_BUFFER;
    seqStore->llCode = SEQUENCE_LLCODE;
    seqStore->mlCode = SEQUENCE_MLCODE;
    seqStore->ofCode = SEQUENCE_OFCODE;

    /* capacities */
    seqStore->maxNbSeq = MAX_NB_SEQ;
    seqStore->maxNbLit = ZSTD_BLOCKSIZE_MAX;

    ZSTD_resetSeqStore(seqStore);
}
634 
635 /* Randomly generate sequence commands */
636 static U32
generateSequences(U32 * seed,frame_t * frame,seqStore_t * seqStore,size_t contentSize,size_t literalsSize,dictInfo info)637 generateSequences(U32* seed, frame_t* frame, seqStore_t* seqStore,
638                   size_t contentSize, size_t literalsSize, dictInfo info)
639 {
640     /* The total length of all the matches */
641     size_t const remainingMatch = contentSize - literalsSize;
642     size_t excessMatch = 0;
643     U32 numSequences = 0;
644     U32 i;
645 
646     const BYTE* literals = LITERAL_BUFFER;
647     BYTE* srcPtr = frame->src;
648 
649     if (literalsSize != contentSize) {
650         /* each match must be at least MIN_SEQ_LEN, so this is the maximum
651          * number of sequences we can have */
652         U32 const maxSequences = (U32)remainingMatch / MIN_SEQ_LEN;
653         numSequences = (RAND(seed) % maxSequences) + 1;
654 
655         /* the extra match lengths we have to allocate to each sequence */
656         excessMatch = remainingMatch - numSequences * MIN_SEQ_LEN;
657     }
658 
659     DISPLAYLEVEL(5, "    total match lengths: %u\n", (unsigned)remainingMatch);
660     for (i = 0; i < numSequences; i++) {
661         /* Generate match and literal lengths by exponential distribution to
662          * ensure nice numbers */
663         U32 matchLen =
664                 MIN_SEQ_LEN +
665                 ROUND(RAND_exp(seed, excessMatch / (double)(numSequences - i)));
666         U32 literalLen =
667                 (RAND(seed) & 7)
668                         ? ROUND(RAND_exp(seed,
669                                          literalsSize /
670                                                  (double)(numSequences - i)))
671                         : 0;
672         /* actual offset, code to send, and point to copy up to when shifting
673          * codes in the repeat offsets history */
674         U32 offset, offsetCode, repIndex;
675 
676         /* bounds checks */
677         matchLen = (U32) MIN(matchLen, excessMatch + MIN_SEQ_LEN);
678         literalLen = MIN(literalLen, (U32) literalsSize);
679         if (i == 0 && srcPtr == frame->srcStart && literalLen == 0) literalLen = 1;
680         if (i + 1 == numSequences) matchLen = MIN_SEQ_LEN + (U32) excessMatch;
681 
682         memcpy(srcPtr, literals, literalLen);
683         srcPtr += literalLen;
684         do {
685             if (RAND(seed) & 7) {
686                 /* do a normal offset */
687                 U32 const dataDecompressed = (U32)((BYTE*)srcPtr-(BYTE*)frame->srcStart);
688                 offset = (RAND(seed) %
689                           MIN(frame->header.windowSize,
690                               (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) +
691                          1;
692                 if (info.useDict && (RAND(seed) & 1) && i + 1 != numSequences && dataDecompressed < frame->header.windowSize) {
693                     /* need to occasionally generate offsets that go past the start */
694                     /* including i+1 != numSequences because the last sequences has to adhere to predetermined contentSize */
695                     U32 lenPastStart = (RAND(seed) % info.dictContentSize) + 1;
696                     offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart)+lenPastStart;
697                     if (offset > frame->header.windowSize) {
698                         if (lenPastStart < MIN_SEQ_LEN) {
699                             /* when offset > windowSize, matchLen bound by end of dictionary (lenPastStart) */
700                             /* this also means that lenPastStart must be greater than MIN_SEQ_LEN */
701                             /* make sure lenPastStart does not go past dictionary start though */
702                             lenPastStart = MIN(lenPastStart+MIN_SEQ_LEN, (U32)info.dictContentSize);
703                             offset = (U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) + lenPastStart;
704                         }
705                         {   U32 const matchLenBound = MIN(frame->header.windowSize, lenPastStart);
706                             matchLen = MIN(matchLen, matchLenBound);
707                         }
708                     }
709                 }
710                 offsetCode = STORE_OFFSET(offset);
711                 repIndex = 2;
712             } else {
713                 /* do a repeat offset */
714                 U32 const randomRepIndex = RAND(seed) % 3;
715                 offsetCode = STORE_REPCODE(randomRepIndex + 1);  /* expects values between 1 & 3 */
716                 if (literalLen > 0) {
717                     offset = frame->stats.rep[randomRepIndex];
718                     repIndex = randomRepIndex;
719                 } else {
720                     /* special case : literalLen == 0 */
721                     offset = randomRepIndex == 2 ? frame->stats.rep[0] - 1
722                                            : frame->stats.rep[randomRepIndex + 1];
723                     repIndex = MIN(2, randomRepIndex + 1);
724                 }
725             }
726         } while (((!info.useDict) && (offset > (size_t)((BYTE*)srcPtr - (BYTE*)frame->srcStart))) || offset == 0);
727 
728         {   BYTE* const dictEnd = info.dictContent + info.dictContentSize;
729             size_t j;
730             for (j = 0; j < matchLen; j++) {
731                 if ((U32)((BYTE*)srcPtr - (BYTE*)frame->srcStart) < offset) {
732                     /* copy from dictionary instead of literals */
733                     size_t const dictOffset = offset - (srcPtr - (BYTE*)frame->srcStart);
734                     *srcPtr = *(dictEnd - dictOffset);
735                 }
736                 else {
737                     *srcPtr = *(srcPtr-offset);
738                 }
739                 srcPtr++;
740         }   }
741 
742         {   int r;
743             for (r = repIndex; r > 0; r--) {
744                 frame->stats.rep[r] = frame->stats.rep[r - 1];
745             }
746             frame->stats.rep[0] = offset;
747         }
748 
749         DISPLAYLEVEL(6, "      LL: %5u OF: %5u ML: %5u",
750                     (unsigned)literalLen, (unsigned)offset, (unsigned)matchLen);
751         DISPLAYLEVEL(7, " srcPos: %8u seqNb: %3u",
752                      (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart), (unsigned)i);
753         DISPLAYLEVEL(6, "\n");
754         if (STORED_IS_REPCODE(offsetCode)) {  /* expects sumtype numeric representation of ZSTD_storeSeq() */
755             DISPLAYLEVEL(7, "        repeat offset: %d\n", (int)repIndex);
756         }
757         /* use libzstd sequence handling */
758         ZSTD_storeSeq(seqStore, literalLen, literals, literals + literalLen,
759                       offsetCode, matchLen);
760 
761         literalsSize -= literalLen;
762         excessMatch -= (matchLen - MIN_SEQ_LEN);
763         literals += literalLen;
764     }
765 
766     memcpy(srcPtr, literals, literalsSize);
767     srcPtr += literalsSize;
768     DISPLAYLEVEL(6, "      excess literals: %5u", (unsigned)literalsSize);
769     DISPLAYLEVEL(7, " srcPos: %8u", (unsigned)((BYTE*)srcPtr - (BYTE*)frame->srcStart));
770     DISPLAYLEVEL(6, "\n");
771 
772     return numSequences;
773 }
774 
/* initSymbolSet() :
 * Rebuilds `set` as a membership table for the `len` codes stored in `symbols` :
 * entries [0, maxSymbolValue] are cleared, then set[s] is set to 1 for every
 * code s present in `symbols`.
 * `set` must provide at least maxSymbolValue+1 entries.
 * A code larger than maxSymbolValue is skipped instead of being written past
 * the end of `set`; isSymbolSubset() rejects such codes on its side. */
static void initSymbolSet(const BYTE* symbols, size_t len, BYTE* set, BYTE maxSymbolValue)
{
    size_t i;

    memset(set, 0, (size_t)maxSymbolValue+1);

    for (i = 0; i < len; i++) {
        if (symbols[i] <= maxSymbolValue) {   /* never index beyond the table */
            set[symbols[i]] = 1;
        }
    }
}
785 
/* isSymbolSubset() :
 * @return : 1 when each of the `len` codes in `symbols` is <= maxSymbolValue
 *           and has a non-zero entry in `set`;
 *           0 as soon as one code fails either condition.
 *           An empty input (len == 0) yields 1. */
static int isSymbolSubset(const BYTE* symbols, size_t len, const BYTE* set, BYTE maxSymbolValue)
{
    size_t pos = 0;

    while (pos < len) {
        BYTE const code = symbols[pos++];
        if (code > maxSymbolValue) return 0;   /* outside the range covered by `set` */
        if (set[code] == 0) return 0;          /* code is not marked in `set` */
    }
    return 1;
}
797 
/* writeSequences() :
 * Writes the sequences section for the `nbSeq` sequences held in `seqStorePtr`
 * at frame->data, and advances frame->data past what was written.
 *
 * Layout produced :
 *  - the number of sequences, on 1, 2 or 3 bytes depending on its value;
 *  - when nbSeq > 0, one byte (seqHead) holding the encoding type selected for
 *    literal lengths, offsets and match lengths, followed by the optional
 *    table descriptions and the bitstream.
 *
 * For each of the three code streams, the encoding type is selected among
 * set_repeat, set_rle, set_basic and set_compressed. The choice is partly
 * random (driven by `seed`) :
 *  - set_repeat is only considered when a previous call initialized the tables
 *    (frame->stats.fseInit) and all current codes belong to the symbol set
 *    recorded in frame->stats;
 *  - set_rle is used when a single code accounts for all sequences.
 * On completion, frame->stats.fseInit is set and the three symbol sets are
 * rebuilt from the codes of this block.
 *
 * @return : 0 on success, or an error code (dstSize_tooSmall / GENERIC).
 *           A failure while building an FSE table is routed through CHECKERR.
 */
static size_t writeSequences(U32* seed, frame_t* frame, seqStore_t* seqStorePtr,
                             size_t nbSeq)
{
    /* This code is mostly copied from ZSTD_compressSequences in zstd_compress.c */
    unsigned count[MaxSeq+1];
    S16 norm[MaxSeq+1];
    FSE_CTable* CTable_LitLength = frame->stats.litlengthCTable;
    FSE_CTable* CTable_OffsetBits = frame->stats.offcodeCTable;
    FSE_CTable* CTable_MatchLength = frame->stats.matchlengthCTable;
    U32 LLtype, Offtype, MLtype;   /* compressed, raw or rle */
    const seqDef* const sequences = seqStorePtr->sequencesStart;
    const BYTE* const ofCodeTable = seqStorePtr->ofCode;
    const BYTE* const llCodeTable = seqStorePtr->llCode;
    const BYTE* const mlCodeTable = seqStorePtr->mlCode;
    BYTE* const oend = (BYTE*)frame->dataEnd;
    BYTE* op = (BYTE*)frame->data;
    BYTE* seqHead;
    BYTE scratchBuffer[FSE_BUILD_CTABLE_WORKSPACE_SIZE(MaxSeq, MaxFSELog)];

    /* literals compressing block removed so that can be done separately */

    /* Sequences Header */
    if ((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead */) return ERROR(dstSize_tooSmall);
    if (nbSeq < 0x7F) *op++ = (BYTE)nbSeq;
    else if (nbSeq < LONGNBSEQ) op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
    else op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;

    /* no sequence : the section consists of the count only */
    if (nbSeq==0) {
        frame->data = op;
        return 0;
    }

    /* seqHead : flags for FSE encoding type */
    seqHead = op++;

    /* convert length/distances into codes */
    ZSTD_seqToCodes(seqStorePtr);

    /* CTable for Literal Lengths */
    {   unsigned max = MaxLL;
        size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, WKSP, sizeof(WKSP));   /* cannot fail */
        assert(!HIST_isError(mostFrequent));
        if (frame->stats.fseInit && !(RAND(seed) & 3) &&
                   isSymbolSubset(llCodeTable, nbSeq,
                                  frame->stats.litlengthSymbolSet, 35)) {
            /* maybe do repeat mode if we're allowed to */
            LLtype = set_repeat;
        } else if (mostFrequent == nbSeq) {
            /* do RLE if we have the chance */
            *op++ = llCodeTable[0];
            FSE_buildCTable_rle(CTable_LitLength, (BYTE)max);
            LLtype = set_rle;
        } else if (!(RAND(seed) & 3)) {
            /* maybe use the default distribution */
            CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, LL_defaultNorm, MaxLL, LL_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)));
            LLtype = set_basic;
        } else {
            /* fall back on a full table */
            size_t nbSeq_1 = nbSeq;
            const U32 tableLog = FSE_optimalTableLog(LLFSELog, nbSeq, max);
            if (count[llCodeTable[nbSeq-1]]>1) { count[llCodeTable[nbSeq-1]]--; nbSeq_1--; }
            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048);
            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
              op += NCountSize; }
            CHECKERR(FSE_buildCTable_wksp(CTable_LitLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)));
            LLtype = set_compressed;
    }   }

    /* CTable for Offsets */
    /* see Literal Lengths for descriptions of mode choices */
    {   unsigned max = MaxOff;
        size_t const mostFrequent = HIST_countFast_wksp(count, &max, ofCodeTable, nbSeq, WKSP, sizeof(WKSP));   /* cannot fail */
        assert(!HIST_isError(mostFrequent));
        if (frame->stats.fseInit && !(RAND(seed) & 3) &&
                   isSymbolSubset(ofCodeTable, nbSeq,
                                  frame->stats.offsetSymbolSet, 28)) {
            Offtype = set_repeat;
        } else if (mostFrequent == nbSeq) {
            *op++ = ofCodeTable[0];
            FSE_buildCTable_rle(CTable_OffsetBits, (BYTE)max);
            Offtype = set_rle;
        } else if (!(RAND(seed) & 3)) {
            /* table build result is checked, same as for literal lengths */
            CHECKERR(FSE_buildCTable_wksp(CTable_OffsetBits, OF_defaultNorm, DefaultMaxOff, OF_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)));
            Offtype = set_basic;
        } else {
            size_t nbSeq_1 = nbSeq;
            const U32 tableLog = FSE_optimalTableLog(OffFSELog, nbSeq, max);
            if (count[ofCodeTable[nbSeq-1]]>1) { count[ofCodeTable[nbSeq-1]]--; nbSeq_1--; }
            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048);
            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
              op += NCountSize; }
            CHECKERR(FSE_buildCTable_wksp(CTable_OffsetBits, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)));
            Offtype = set_compressed;
    }   }

    /* CTable for MatchLengths */
    /* see Literal Lengths for descriptions of mode choices */
    {   unsigned max = MaxML;
        size_t const mostFrequent = HIST_countFast_wksp(count, &max, mlCodeTable, nbSeq, WKSP, sizeof(WKSP));   /* cannot fail */
        assert(!HIST_isError(mostFrequent));
        if (frame->stats.fseInit && !(RAND(seed) & 3) &&
                   isSymbolSubset(mlCodeTable, nbSeq,
                                  frame->stats.matchlengthSymbolSet, 52)) {
            MLtype = set_repeat;
        } else if (mostFrequent == nbSeq) {
            *op++ = *mlCodeTable;
            FSE_buildCTable_rle(CTable_MatchLength, (BYTE)max);
            MLtype = set_rle;
        } else if (!(RAND(seed) & 3)) {
            /* sometimes do default distribution */
            CHECKERR(FSE_buildCTable_wksp(CTable_MatchLength, ML_defaultNorm, MaxML, ML_defaultNormLog, scratchBuffer, sizeof(scratchBuffer)));
            MLtype = set_basic;
        } else {
            /* fall back on table */
            size_t nbSeq_1 = nbSeq;
            const U32 tableLog = FSE_optimalTableLog(MLFSELog, nbSeq, max);
            if (count[mlCodeTable[nbSeq-1]]>1) { count[mlCodeTable[nbSeq-1]]--; nbSeq_1--; }
            FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max, nbSeq >= 2048);
            { size_t const NCountSize = FSE_writeNCount(op, oend-op, norm, max, tableLog);   /* overflow protected */
              if (FSE_isError(NCountSize)) return ERROR(GENERIC);
              op += NCountSize; }
            CHECKERR(FSE_buildCTable_wksp(CTable_MatchLength, norm, max, tableLog, scratchBuffer, sizeof(scratchBuffer)));
            MLtype = set_compressed;
    }   }

    /* record which codes this block used : consulted by the set_repeat
     * eligibility test (isSymbolSubset) of a later call */
    frame->stats.fseInit = 1;
    initSymbolSet(llCodeTable, nbSeq, frame->stats.litlengthSymbolSet, 35);
    initSymbolSet(ofCodeTable, nbSeq, frame->stats.offsetSymbolSet, 28);
    initSymbolSet(mlCodeTable, nbSeq, frame->stats.matchlengthSymbolSet, 52);

    DISPLAYLEVEL(5, "    LL type: %d OF type: %d ML type: %d\n", (unsigned)LLtype, (unsigned)Offtype, (unsigned)MLtype);

    /* 2 bits per type : LL in bits 7-6, OF in bits 5-4, ML in bits 3-2 */
    *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));

    /* Encoding Sequences */
    {   BIT_CStream_t blockStream;
        FSE_CState_t  stateMatchLength;
        FSE_CState_t  stateOffsetBits;
        FSE_CState_t  stateLitLength;

        RETURN_ERROR_IF(
            ERR_isError(BIT_initCStream(&blockStream, op, oend-op)),
            dstSize_tooSmall, "not enough space remaining");

        /* first symbols : the last sequence initializes the three FSE states */
        FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
        FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
        FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
        BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
        if (MEM_32bits()) BIT_flushBits(&blockStream);
        BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTable[nbSeq-1]]);
        if (MEM_32bits()) BIT_flushBits(&blockStream);
        BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[nbSeq-1]);
        BIT_flushBits(&blockStream);

        /* remaining sequences, from second-to-last down to the first */
        {   size_t n;
            for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
                BYTE const llCode = llCodeTable[n];
                BYTE const ofCode = ofCodeTable[n];
                BYTE const mlCode = mlCodeTable[n];
                U32  const llBits = LL_bits[llCode];
                U32  const ofBits = ofCode;                                     /* 32b*/  /* 64b*/
                U32  const mlBits = ML_bits[mlCode];
                                                                                /* (7)*/  /* (7)*/
                FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
                FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
                if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
                FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
                if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
                    BIT_flushBits(&blockStream);                                /* (7)*/
                BIT_addBits(&blockStream, sequences[n].litLength, llBits);
                if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
                BIT_addBits(&blockStream, sequences[n].mlBase, mlBits);
                if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
                BIT_addBits(&blockStream, sequences[n].offBase, ofBits);         /* 31 */
                BIT_flushBits(&blockStream);                                    /* (7)*/
        }   }

        FSE_flushCState(&blockStream, &stateMatchLength);
        FSE_flushCState(&blockStream, &stateOffsetBits);
        FSE_flushCState(&blockStream, &stateLitLength);

        {   size_t const streamSize = BIT_closeCStream(&blockStream);
            if (streamSize==0) return ERROR(dstSize_tooSmall);   /* not enough space */
            op += streamSize;
    }   }

    frame->data = op;

    return 0;
}
990 
/* writeSequencesBlock() :
 * Generates random sequences for a block of `contentSize` bytes that uses
 * `literalsSize` bytes of literals, then writes the sequences section into
 * the frame. An error reported by writeSequences() is handled by CHECKERR.
 * @return : number of sequences generated */
static size_t writeSequencesBlock(U32* seed, frame_t* frame, size_t contentSize,
                                  size_t literalsSize, dictInfo info)
{
    seqStore_t store;
    size_t nbSeq;

    initSeqStore(&store);

    /* step 1 : pick the sequences at random */
    nbSeq = generateSequences(seed, frame, &store, contentSize, literalsSize, info);

    /* step 2 : serialize them into the frame data */
    CHECKERR(writeSequences(seed, frame, &store, nbSeq));

    return nbSeq;
}
1007 
/* writeCompressedBlock() :
 * Writes the body of a compressed block at frame->data : a literals section
 * followed by a sequences section, both generated from `seed`.
 * @return : number of bytes by which frame->data advanced */
static size_t writeCompressedBlock(U32* seed, frame_t* frame, size_t contentSize, dictInfo info)
{
    const BYTE* const start = (const BYTE*)frame->data;

    DISPLAYLEVEL(4, "  compressed block:\n");

    {   size_t const litSize = writeLiteralsBlock(seed, frame, contentSize);
        DISPLAYLEVEL(4, "   literals size: %u\n", (unsigned)litSize);

        {   size_t const seqCount = writeSequencesBlock(seed, frame, contentSize, litSize, info);
            DISPLAYLEVEL(4, "   number of sequences: %u\n", (unsigned)seqCount);
    }   }

    return (size_t)((const BYTE*)frame->data - start);
}
1026 
/* writeBlock() :
 * Emits one block (3-byte header + payload) at frame->data and generates the
 * matching `contentSize` bytes of content at frame->src.
 * Both frame->data and frame->src are advanced past what was produced.
 * The block type is drawn from `seed` :
 *  - 1 chance in 8 : raw block (type 0) filled with random bytes;
 *  - 1 chance in 8, only if the frame content size is non-zero : RLE block (type 1);
 *  - otherwise : compressed block (type 2), replaced by a raw block when the
 *    compressed form is not strictly smaller than the content.
 * `lastBlock` is stored in the lowest bit of the header. */
static void writeBlock(U32* seed, frame_t* frame, size_t contentSize,
                       int lastBlock, dictInfo info)
{
    int const typeRoll = RAND(seed) % 8;
    BYTE* const blockHeader = (BYTE*)frame->data;
    BYTE* out = blockHeader + 3;   /* payload starts right after the header */
    size_t sizeField;
    int type;

    DISPLAYLEVEL(4, " block:\n");
    DISPLAYLEVEL(4, "  block content size: %u\n", (unsigned)contentSize);
    DISPLAYLEVEL(4, "  last block: %s\n", lastBlock ? "yes" : "no");

    if (typeRoll == 0) {
        /* raw block : random content, copied verbatim */
        RAND_buffer(seed, frame->src, contentSize);
        memcpy(out, frame->src, contentSize);
        out += contentSize;

        type = 0;
        sizeField = contentSize;
    } else if (typeRoll == 1 && frame->header.contentSize > 0) {
        /* RLE block : skipped when the frame content is 0,
         * since a block size of 1 may exceed max block size */
        BYTE const fill = RAND(seed) & 0xff;

        *out++ = fill;
        memset(frame->src, fill, contentSize);

        type = 1;
        sizeField = contentSize;
    } else {
        /* compressed block, the most common case */
        size_t cSize;

        frame->oldStats = frame->stats;   /* saved in case the attempt is discarded */
        frame->data = out;
        cSize = writeCompressedBlock(seed, frame, contentSize, info);

        if (cSize < contentSize) {
            type = 2;
            out += cSize;
            sizeField = cSize;
        } else {
            /* a compressed block must be strictly smaller than its content :
             * store the generated content as a raw block instead */
            memcpy(out, frame->src, contentSize);
            out += contentSize;

            type = 0;
            sizeField = contentSize;

            frame->stats = frame->oldStats;   /* discard stats of the failed attempt */
        }
    }
    frame->src = (BYTE*)frame->src + contentSize;

    DISPLAYLEVEL(4, "  block type: %s\n", BLOCK_TYPES[type]);
    DISPLAYLEVEL(4, "  block size field: %u\n", (unsigned)sizeField);

    /* header : bit 0 = last block, bits 1-2 = type, remaining bits = size */
    blockHeader[0] = (BYTE) ((lastBlock | (type << 1) | (sizeField << 3)) & 0xff);
    MEM_writeLE16(blockHeader + 1, (U16) (sizeField >> 5));

    frame->data = out;
}
1093 
/* writeBlocks() :
 * Splits the frame content (frame->header.contentSize bytes) into blocks of
 * random sizes and writes them one after another, up to and including a block
 * flagged as last. A block never holds more than
 * MIN(g_maxBlockSize, windowSize) bytes of content. */
static void writeBlocks(U32* seed, frame_t* frame, dictInfo info)
{
    size_t const blockSizeLimit = MIN(g_maxBlockSize, frame->header.windowSize);
    size_t remaining = frame->header.contentSize;
    int isLast;

    do {
        size_t thisBlock = 0;   /* default : some empty block */

        /* once the remaining content fits in a single block,
         * there is a 1 in 4 chance of ending the frame */
        isLast = (remaining > blockSizeLimit) ? 0 : !(RAND(seed) & 3);

        if (isLast) {
            thisBlock = remaining;
        } else if (remaining > 0 && (RAND(seed) & 7)) {
            /* some variable size block */
            thisBlock = RAND(seed) % (MIN(blockSizeLimit, remaining)+1);
        } else if (remaining > blockSizeLimit && (RAND(seed) & 1)) {
            /* some full size block */
            thisBlock = blockSizeLimit;
        }

        writeBlock(seed, frame, thisBlock, isLast, info);

        remaining -= thisBlock;
    } while (!isLast);
}
1123 
/* writeChecksum() :
 * Appends a 4-byte little-endian checksum at frame->data : the low 32 bits of
 * XXH64 (seed 0) computed over the content generated so far, i.e. the bytes
 * between frame->srcStart and frame->src. It lets implementations verify
 * their output. frame->data is advanced by 4. */
static void writeChecksum(frame_t* frame)
{
    const BYTE* const contentStart = (const BYTE*)frame->srcStart;
    size_t const contentLen = (size_t)((const BYTE*)frame->src - contentStart);
    U32 const checksum32 = (U32)XXH64(contentStart, contentLen, 0);

    DISPLAYLEVEL(3, "  checksum: %08x\n", (unsigned)checksum32);
    MEM_writeLE32(frame->data, checksum32);
    frame->data = (BYTE*)frame->data + 4;
}
1132 
outputBuffer(const void * buf,size_t size,const char * const path)1133 static void outputBuffer(const void* buf, size_t size, const char* const path)
1134 {
1135     /* write data out to file */
1136     const BYTE* ip = (const BYTE*)buf;
1137     FILE* out;
1138     if (path) {
1139         out = fopen(path, "wb");
1140     } else {
1141         out = stdout;
1142     }
1143     if (!out) {
1144         fprintf(stderr, "Failed to open file at %s: ", path);
1145         perror(NULL);
1146         exit(1);
1147     }
1148 
1149     {   size_t fsize = size;
1150         size_t written = 0;
1151         while (written < fsize) {
1152             written += fwrite(ip + written, 1, fsize - written, out);
1153             if (ferror(out)) {
1154                 fprintf(stderr, "Failed to write to file at %s: ", path);
1155                 perror(NULL);
1156                 exit(1);
1157             }
1158         }
1159     }
1160 
1161     if (path) {
1162         fclose(out);
1163     }
1164 }
1165 
/* initFrame() :
 * Resets `fr` to a blank state : all fields zeroed, compressed-data pointers
 * bound to FRAME_BUFFER, content pointers bound to CONTENT_BUFFER, and
 * repeat offsets set to their starting values (1, 4, 8). */
static void initFrame(frame_t* fr)
{
    static const unsigned startingReps[3] = { 1, 4, 8 };
    int r;

    memset(fr, 0, sizeof(*fr));

    /* compressed output area */
    fr->dataStart = FRAME_BUFFER;
    fr->data = FRAME_BUFFER;
    fr->dataEnd = FRAME_BUFFER + sizeof(FRAME_BUFFER);

    /* decompressed content area */
    fr->srcStart = CONTENT_BUFFER;
    fr->src = CONTENT_BUFFER;
    fr->srcEnd = CONTENT_BUFFER + sizeof(CONTENT_BUFFER);

    /* init repeat codes */
    for (r = 0; r < 3; r++) {
        fr->stats.rep[r] = startingReps[r];
    }
}
1179 
1180 /**
1181  * Generated a single zstd compressed block with no block/frame header.
1182  * Returns the final seed.
1183  */
generateCompressedBlock(U32 seed,frame_t * frame,dictInfo info)1184 static U32 generateCompressedBlock(U32 seed, frame_t* frame, dictInfo info)
1185 {
1186     size_t blockContentSize;
1187     int blockWritten = 0;
1188     BYTE* op;
1189     DISPLAYLEVEL(4, "block seed: %u\n", (unsigned)seed);
1190     initFrame(frame);
1191     op = (BYTE*)frame->data;
1192 
1193     while (!blockWritten) {
1194         size_t cSize;
1195         /* generate window size */
1196         {   int const exponent = RAND(&seed) % (MAX_WINDOW_LOG - 10);
1197             int const mantissa = RAND(&seed) % 8;
1198             frame->header.windowSize = (1U << (exponent + 10));
1199             frame->header.windowSize += (frame->header.windowSize / 8) * mantissa;
1200         }
1201 
1202         /* generate content size */
1203         {   size_t const maxBlockSize = MIN(g_maxBlockSize, frame->header.windowSize);
1204             if (RAND(&seed) & 15) {
1205                 /* some full size blocks */
1206                 blockContentSize = maxBlockSize;
1207             } else if (RAND(&seed) & 7 && g_maxBlockSize >= (1U << 7)) {
1208                 /* some small blocks <= 128 bytes*/
1209                 blockContentSize = RAND(&seed) % (1U << 7);
1210             } else {
1211                 /* some variable size blocks */
1212                 blockContentSize = RAND(&seed) % maxBlockSize;
1213             }
1214         }
1215 
1216         /* try generating a compressed block */
1217         frame->oldStats = frame->stats;
1218         frame->data = op;
1219         cSize = writeCompressedBlock(&seed, frame, blockContentSize, info);
1220         if (cSize >= blockContentSize) {  /* compressed size must be strictly smaller than decompressed size : https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks */
1221             /* data doesn't compress -- try again */
1222             frame->stats = frame->oldStats; /* don't update the stats */
1223             DISPLAYLEVEL(5, "   can't compress block : try again \n");
1224         } else {
1225             blockWritten = 1;
1226             DISPLAYLEVEL(4, "   block size: %u \n", (unsigned)cSize);
1227             frame->src = (BYTE*)frame->src + blockContentSize;
1228         }
1229     }
1230     return seed;
1231 }
1232 
1233 /* Return the final seed */
generateFrame(U32 seed,frame_t * fr,dictInfo info)1234 static U32 generateFrame(U32 seed, frame_t* fr, dictInfo info)
1235 {
1236     /* generate a complete frame */
1237     DISPLAYLEVEL(3, "frame seed: %u\n", (unsigned)seed);
1238     initFrame(fr);
1239 
1240     writeFrameHeader(&seed, fr, info);
1241     writeBlocks(&seed, fr, info);
1242     writeChecksum(fr);
1243 
1244     return seed;
1245 }
1246 
1247 /*_*******************************************************
1248 *  Dictionary Helper Functions
1249 *********************************************************/
1250 /* returns 0 if successful, otherwise returns 1 upon error */
genRandomDict(U32 dictID,U32 seed,size_t dictSize,BYTE * fullDict)1251 static int genRandomDict(U32 dictID, U32 seed, size_t dictSize, BYTE* fullDict)
1252 {
1253     /* allocate space for samples */
1254     int ret = 0;
1255     unsigned const numSamples = 4;
1256     size_t sampleSizes[4];
1257     BYTE* const samples = malloc(5000*sizeof(BYTE));
1258     if (samples == NULL) {
1259         DISPLAY("Error: could not allocate space for samples\n");
1260         return 1;
1261     }
1262 
1263     /* generate samples */
1264     {   unsigned literalValue = 1;
1265         unsigned samplesPos = 0;
1266         size_t currSize = 1;
1267         while (literalValue <= 4) {
1268             sampleSizes[literalValue - 1] = currSize;
1269             {   size_t k;
1270                 for (k = 0; k < currSize; k++) {
1271                     *(samples + (samplesPos++)) = (BYTE)literalValue;
1272             }   }
1273             literalValue++;
1274             currSize *= 16;
1275     }   }
1276 
1277     {   size_t dictWriteSize = 0;
1278         ZDICT_params_t zdictParams;
1279         size_t const headerSize = MAX(dictSize/4, 256);
1280         size_t const dictContentSize = dictSize - headerSize;
1281         BYTE* const dictContent = fullDict + headerSize;
1282         if (dictContentSize < ZDICT_CONTENTSIZE_MIN || dictSize < ZDICT_DICTSIZE_MIN) {
1283             DISPLAY("Error: dictionary size is too small\n");
1284             ret = 1;
1285             goto exitGenRandomDict;
1286         }
1287 
1288         /* init dictionary params */
1289         memset(&zdictParams, 0, sizeof(zdictParams));
1290         zdictParams.dictID = dictID;
1291         zdictParams.notificationLevel = 1;
1292 
1293         /* fill in dictionary content */
1294         RAND_buffer(&seed, (void*)dictContent, dictContentSize);
1295 
1296         /* finalize dictionary with random samples */
1297         dictWriteSize = ZDICT_finalizeDictionary(fullDict, dictSize,
1298                                     dictContent, dictContentSize,
1299                                     samples, sampleSizes, numSamples,
1300                                     zdictParams);
1301 
1302         if (ZDICT_isError(dictWriteSize)) {
1303             DISPLAY("Could not finalize dictionary: %s\n", ZDICT_getErrorName(dictWriteSize));
1304             ret = 1;
1305         }
1306     }
1307 
1308 exitGenRandomDict:
1309     free(samples);
1310     return ret;
1311 }
1312 
initDictInfo(int useDict,size_t dictContentSize,BYTE * dictContent,U32 dictID)1313 static dictInfo initDictInfo(int useDict, size_t dictContentSize, BYTE* dictContent, U32 dictID){
1314     /* allocate space statically */
1315     dictInfo dictOp;
1316     memset(&dictOp, 0, sizeof(dictOp));
1317     dictOp.useDict = useDict;
1318     dictOp.dictContentSize = dictContentSize;
1319     dictOp.dictContent = dictContent;
1320     dictOp.dictID = dictID;
1321     return dictOp;
1322 }
1323 
1324 /*-*******************************************************
1325 *  Test Mode
1326 *********************************************************/
1327 
1328 BYTE DECOMPRESSED_BUFFER[MAX_DECOMPRESSED_SIZE];
1329 
testDecodeSimple(frame_t * fr)1330 static size_t testDecodeSimple(frame_t* fr)
1331 {
1332     /* test decoding the generated data with the simple API */
1333     size_t const ret = ZSTD_decompress(DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
1334                            fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
1335 
1336     if (ZSTD_isError(ret)) return ret;
1337 
1338     if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
1339                (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
1340         return ERROR(corruption_detected);
1341     }
1342 
1343     return ret;
1344 }
1345 
/* testDecodeStreaming() :
 * Decodes the generated data with the streaming API and compares the output
 * against the content recorded in `fr`.
 * The output capacity starts at ZSTD_DStreamOutSize() and is enlarged by at
 * most that amount after each call, so that decoding proceeds in chunks.
 * The decompression stream is released on every exit path.
 * @return : number of bytes decoded, or an error code
 *           (memory_allocation, corruption_detected, or a decoder error) */
static size_t testDecodeStreaming(frame_t* fr)
{
    ZSTD_DStream* const zd = ZSTD_createDStream();
    ZSTD_inBuffer in;
    ZSTD_outBuffer out;
    size_t ret;

    if (!zd) return ERROR(memory_allocation);

    in.src = fr->dataStart;
    in.pos = 0;
    in.size = (BYTE*)fr->data - (BYTE*)fr->dataStart;

    out.dst = DECOMPRESSED_BUFFER;
    out.pos = 0;
    out.size = ZSTD_DStreamOutSize();

    ret = ZSTD_initDStream(zd);
    if (ZSTD_isError(ret)) goto cleanup;

    while (1) {
        ret = ZSTD_decompressStream(zd, &out, &in);
        if (ZSTD_isError(ret)) goto cleanup; /* error */
        if (ret == 0) break; /* frame is done */

        /* force decoding to be done in chunks */
        out.size += MIN(ZSTD_DStreamOutSize(), MAX_DECOMPRESSED_SIZE - out.size);
    }

    ret = out.pos;

    if (memcmp(out.dst, fr->srcStart, out.pos) != 0) {
        /* fall through to cleanup instead of returning : zd must be freed */
        ret = ERROR(corruption_detected);
    }

cleanup:
    ZSTD_freeDStream(zd);
    return ret;
}
1384 
/* testDecodeWithDict() :
 * Generates a random dictionary, then a frame (gt_frame) or a single block
 * (any other genType) that may reference it, decodes the result with the
 * dictionary loaded, and compares the output against the generated content.
 * The dictionary size and ID are drawn from `seed`.
 * The dictionary buffer and the decompression context are released on every
 * exit path.
 * @return : 0 on success, or an error code
 *           (GENERIC, memory_allocation, corruption_detected, or a decoder error) */
static size_t testDecodeWithDict(U32 seed, genType_e genType)
{
    /* create variables */
    size_t const dictSize = RAND(&seed) % (10 << 20) + ZDICT_DICTSIZE_MIN + ZDICT_CONTENTSIZE_MIN;
    U32 const dictID = RAND(&seed);
    size_t errorDetected = 0;
    ZSTD_DCtx* dctx = NULL;
    BYTE* const fullDict = malloc(dictSize);
    if (fullDict == NULL) {
        return ERROR(GENERIC);
    }

    /* generate random dictionary */
    if (genRandomDict(dictID, seed, dictSize, fullDict)) {  /* return 0 on success */
        errorDetected = ERROR(GENERIC);
        goto dictTestCleanup;
    }

    dctx = ZSTD_createDCtx();
    if (dctx == NULL) {
        errorDetected = ERROR(memory_allocation);
        goto dictTestCleanup;
    }

    {   frame_t fr;
        dictInfo info;
        size_t ret;

        /* get dict info : same header/content split as genRandomDict() */
        {   size_t const headerSize = MAX(dictSize/4, 256);
            size_t const dictContentSize = dictSize-headerSize;
            BYTE* const dictContent = fullDict+headerSize;
            info = initDictInfo(1, dictContentSize, dictContent, dictID);
        }

        /* manually decompress and check difference */
        if (genType == gt_frame) {
            /* Test frame */
            generateFrame(seed, &fr, info);
            ret = ZSTD_decompress_usingDict(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
                                            fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart,
                                            fullDict, dictSize);
        } else {
            /* Test block */
            generateCompressedBlock(seed, &fr, info);
            ret = ZSTD_decompressBegin_usingDict(dctx, fullDict, dictSize);
            if (!ZSTD_isError(ret)) {
                ret = ZSTD_decompressBlock(dctx, DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
                                           fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart);
            }
        }

        if (ZSTD_isError(ret)) {
            errorDetected = ret;
            goto dictTestCleanup;
        }

        if (memcmp(DECOMPRESSED_BUFFER, fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart) != 0) {
            errorDetected = ERROR(corruption_detected);
        }
    }

dictTestCleanup:
    ZSTD_freeDCtx(dctx);   /* dctx may still be NULL here; ZSTD_freeDCtx() accepts NULL */
    free(fullDict);
    return errorDetected;
}
1451 
/* testDecodeRawBlock() :
 * Decodes the header-less compressed block stored in `fr` with
 * ZSTD_decompressBlock() and compares the output against the content
 * recorded in `fr`.
 * The decompression context is released on every exit path.
 * @return : the value returned by ZSTD_decompressBlock(), or an error code
 *           (memory_allocation, corruption_detected, or a decoder error) */
static size_t testDecodeRawBlock(frame_t* fr)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    size_t ret;

    if (dctx == NULL) return ERROR(memory_allocation);

    ret = ZSTD_decompressBegin(dctx);
    if (!ZSTD_isError(ret)) {
        ret = ZSTD_decompressBlock(
                dctx,
                DECOMPRESSED_BUFFER, MAX_DECOMPRESSED_SIZE,
                fr->dataStart, (BYTE*)fr->data - (BYTE*)fr->dataStart);
    }
    /* single release point : also covers a failed ZSTD_decompressBegin() */
    ZSTD_freeDCtx(dctx);
    if (ZSTD_isError(ret)) return ret;

    if (memcmp(DECOMPRESSED_BUFFER, fr->srcStart,
               (BYTE*)fr->src - (BYTE*)fr->srcStart) != 0) {
        return ERROR(corruption_detected);
    }

    return ret;
}
1472 
/* runBlockTest() :
 * Runs one block-mode test round : generates a compressed block without
 * dictionary and decodes it, then runs the dictionary variant of the test.
 * `*seed` is replaced by the seed returned by the block generator.
 * Error messages report the seed value received on entry.
 * @return : 0 on success, 1 if any decoding check failed */
static int runBlockTest(U32* seed)
{
    U32 const initialSeed = *seed;
    frame_t fr;
    size_t result;

    /* block generated without dictionary */
    *seed = generateCompressedBlock(initialSeed, &fr, initDictInfo(0, 0, NULL, 0));

    result = testDecodeRawBlock(&fr);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in block mode on test seed %u: %s\n",
                (unsigned)initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    /* same exercise, with a random dictionary */
    result = testDecodeWithDict(*seed, gt_block);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in block mode with dictionary on test seed %u: %s\n",
                (unsigned)initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    return 0;
}
1498 
/* runFrameTest() :
 * Runs one frame-mode test round : generates a frame without dictionary,
 * decodes it with the simple API and with the streaming API, then runs the
 * dictionary variant of the test.
 * `*seed` is replaced by the seed returned by the frame generator.
 * Error messages report the seed value received on entry.
 * @return : 0 on success, 1 if any decoding check failed */
static int runFrameTest(U32* seed)
{
    U32 const initialSeed = *seed;
    frame_t fr;
    size_t result;

    /* frame generated without dictionary */
    *seed = generateFrame(initialSeed, &fr, initDictInfo(0, 0, NULL, 0));

    result = testDecodeSimple(&fr);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in simple mode on test seed %u: %s\n",
                (unsigned)initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    result = testDecodeStreaming(&fr);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in streaming mode on test seed %u: %s\n",
                (unsigned)initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    /* same exercise, with a random dictionary */
    result = testDecodeWithDict(*seed, gt_frame);
    if (ZSTD_isError(result)) {
        DISPLAY("Error in dictionary mode on test seed %u: %s\n",
                (unsigned)initialSeed, ZSTD_getErrorName(result));
        return 1;
    }

    return 0;
}
1530 
/* runTestMode() :
 * Runs frame tests (genType == gt_frame) or block tests (any other genType)
 * in a loop. The loop keeps going while fewer than `numFiles` tests have run
 * OR the elapsed time is below `testDurationS` seconds. When both limits are
 * zero, exactly one test is run.
 * @return : 0 if all tests pass, otherwise the first non-zero test result */
static int runTestMode(U32 seed, unsigned numFiles, unsigned const testDurationS,
                       genType_e genType)
{
    UTIL_time_t const startClock = UTIL_getTime();
    U64 const maxClockSpan = testDurationS * SEC_TO_MICRO;
    unsigned fnum = 0;

    if (numFiles == 0 && !testDurationS) numFiles = 1;

    DISPLAY("seed: %u\n", (unsigned)seed);

    while (fnum < numFiles || UTIL_clockSpanMicro(startClock) < maxClockSpan) {
        int status;

        /* progress : show the target count only while it has not been reached */
        if (fnum < numFiles) {
            DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);
        } else {
            DISPLAYUPDATE("\r%u           ", fnum);
        }

        if (genType == gt_frame) {
            status = runFrameTest(&seed);
        } else {
            status = runBlockTest(&seed);
        }
        if (status) return status;

        fnum++;
    }

    DISPLAY("\r%u tests completed: ", fnum);
    DISPLAY("OK\n");

    return 0;
}
1561 
1562 /*-*******************************************************
1563 *  File I/O
1564 *********************************************************/
1565 
/* generateFile() :
 * Generates a single frame (genType == gt_frame) or a single compressed block
 * (any other genType) from `seed`, without dictionary, and writes the
 * compressed data to `path`. When `origPath` is non-NULL, the original
 * (uncompressed) data is written there as well.
 * @return : always 0 */
static int generateFile(U32 seed, const char* const path,
                        const char* const origPath, genType_e genType)
{
    frame_t fr;

    DISPLAY("seed: %u\n", (unsigned)seed);

    {   dictInfo const noDict = initDictInfo(0, 0, NULL, 0);
        if (genType != gt_frame) {
            generateCompressedBlock(seed, &fr, noDict);
        } else {
            generateFrame(seed, &fr, noDict);
        }
    }

    /* compressed output */
    outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);

    /* optional original content */
    if (origPath != NULL) {
        outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
    }

    return 0;
}
1586 
/* generateCorpus() :
 * Generates `numFiles` frames (genType == gt_frame) or compressed blocks
 * (any other genType) without dictionary, chaining the seed from one file to
 * the next. Compressed data is written to "<path>/zNNNNNN.zst"; when
 * `origPath` is non-NULL, the original data is written to "<origPath>/zNNNNNN".
 * @return : 0 on success, 1 if an output path cannot be formatted
 *           (too long for MAX_PATH, or snprintf() reported an error) */
static int generateCorpus(U32 seed, unsigned numFiles, const char* const path,
                          const char* const origPath, genType_e genType)
{
    char outPath[MAX_PATH];
    unsigned fnum;

    DISPLAY("seed: %u\n", (unsigned)seed);

    for (fnum = 0; fnum < numFiles; fnum++) {
        frame_t fr;

        DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);

        {   dictInfo const info = initDictInfo(0, 0, NULL, 0);
            if (genType == gt_frame) {
                seed = generateFrame(seed, &fr, info);
            } else {
                seed = generateCompressedBlock(seed, &fr, info);
            }
        }

        /* snprintf() returns a negative value on error and the untruncated
         * length otherwise : both failure modes must be rejected before
         * outPath is used. */
        {   int const len = snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum);
            if (len < 0 || len >= MAX_PATH) {
                DISPLAY("Error: path too long\n");
                return 1;
            }
        }
        outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);

        if (origPath) {
            int const len = snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum);
            if (len < 0 || len >= MAX_PATH) {
                DISPLAY("Error: path too long\n");
                return 1;
            }
            outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
        }
    }

    DISPLAY("\r%u/%u      \n", fnum, numFiles);

    return 0;
}
1627 
/* generateCorpusWithDict() :
 * Generates a random dictionary of `dictSize` bytes, writes it out, then
 * generates frames (genType == gt_frame) or compressed blocks (any other
 * genType) that reference this dictionary.
 *  - numFiles != 0 : `path` and `origPath` are used as directories. The
 *    dictionary goes to "<path>/dictionary", compressed data to
 *    "<path>/zNNNNNN.zst" and, when `origPath` is non-NULL, original data to
 *    "<origPath>/zNNNNNN".
 *  - numFiles == 0 : a single item is generated. The dictionary is written to
 *    "dictionary", compressed data to `path` and, when `origPath` is non-NULL,
 *    original data to `origPath`.
 * @return : 0 on success, non-zero on error (path formatting failure,
 *           allocation failure, or error reported by genRandomDict()) */
static int generateCorpusWithDict(U32 seed, unsigned numFiles, const char* const path,
                                  const char* const origPath, const size_t dictSize,
                                  genType_e genType)
{
    char outPath[MAX_PATH];
    BYTE* fullDict;
    U32 const dictID = RAND(&seed);
    int errorDetected = 0;

    /* Format the dictionary path first, so that an unusable path is rejected
     * before anything is allocated. snprintf() returns a negative value on
     * error and the untruncated length otherwise : reject both.
     * outPath is left untouched until the dictionary has been written. */
    {   int const len = snprintf(outPath, MAX_PATH, "%s/dictionary", path);
        if (len < 0 || len >= MAX_PATH) {
            DISPLAY("Error: path too long\n");
            return 1;
        }
    }

    /* allocate space for the dictionary */
    fullDict = malloc(dictSize);
    if (fullDict == NULL) {
        DISPLAY("Error: could not allocate space for full dictionary.\n");
        return 1;
    }

    /* randomly generate the dictionary */
    {   int const ret = genRandomDict(dictID, seed, dictSize, fullDict);
        if (ret != 0) {
            errorDetected = ret;
            goto dictCleanup;
        }
    }

    /* write out dictionary :
     * outPath still holds "<path>/dictionary", validated above */
    if (numFiles != 0) {
        outputBuffer(fullDict, dictSize, outPath);
    } else {
        outputBuffer(fullDict, dictSize, "dictionary");
    }

    /* generate random compressed/decompressed files */
    {   /* The dictionary layout does not depend on the file being generated :
         * the content handed to the generators starts after a header area of
         * MAX(dictSize/4, 256) bytes. */
        size_t const headerSize = MAX(dictSize/4, 256);
        size_t const dictContentSize = dictSize - headerSize;
        BYTE* const dictContent = fullDict + headerSize;
        unsigned const nbItems = MAX(numFiles, 1);   /* numFiles==0 => single file */
        unsigned fnum;

        for (fnum = 0; fnum < nbItems; fnum++) {
            frame_t fr;
            DISPLAYUPDATE("\r%u/%u        ", fnum, numFiles);

            {   dictInfo const info = initDictInfo(1, dictContentSize, dictContent, dictID);
                if (genType == gt_frame) {
                    seed = generateFrame(seed, &fr, info);
                } else {
                    seed = generateCompressedBlock(seed, &fr, info);
                }
            }

            if (numFiles != 0) {
                /* corpus mode : numbered files inside the given directories */
                {   int const len = snprintf(outPath, MAX_PATH, "%s/z%06u.zst", path, fnum);
                    if (len < 0 || len >= MAX_PATH) {
                        DISPLAY("Error: path too long\n");
                        errorDetected = 1;
                        goto dictCleanup;
                    }
                }
                outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, outPath);

                if (origPath) {
                    int const len = snprintf(outPath, MAX_PATH, "%s/z%06u", origPath, fnum);
                    if (len < 0 || len >= MAX_PATH) {
                        DISPLAY("Error: path too long\n");
                        errorDetected = 1;
                        goto dictCleanup;
                    }
                    outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, outPath);
                }
            } else {
                /* single file mode : paths are used as provided */
                outputBuffer(fr.dataStart, (BYTE*)fr.data - (BYTE*)fr.dataStart, path);
                if (origPath) {
                    outputBuffer(fr.srcStart, (BYTE*)fr.src - (BYTE*)fr.srcStart, origPath);
                }
            }
        }
    }

dictCleanup:
    free(fullDict);
    return errorDetected;
}
1717 
1718 
1719 /*_*******************************************************
1720 *  Command line
1721 *********************************************************/
makeSeed(void)1722 static U32 makeSeed(void)
1723 {
1724     U32 t = (U32) time(NULL);
1725     return XXH32(&t, sizeof(t), 0) % 65536;
1726 }
1727 
/* readInt() :
 * Reads the run of decimal digits found at `*argument`.
 * `*argument` is advanced to the first non-digit character.
 * @return : the decoded value, 0 when no digit is present.
 * Note : the result wraps around if the value exceeds the range of `unsigned`. */
static unsigned readInt(const char** argument)
{
    const char* cursor = *argument;
    unsigned total = 0;

    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
        total = total * 10 + (unsigned)(*cursor - '0');
    }

    *argument = cursor;
    return total;
}
1738 
/* usage() :
 * Prints the short help text to stderr (through DISPLAY).
 * `programName` is inserted in the invocation line.
 * The -p and -T descriptions follow what main() implements : an output path
 * is mandatory outside of test mode, and the -T value is a number of seconds
 * unless followed by `m` or `mn`, which selects minutes. */
static void usage(const char* programName)
{
    DISPLAY( "Usage :\n");
    DISPLAY( "      %s [args]\n", programName);
    DISPLAY( "\n");
    DISPLAY( "Arguments :\n");
    DISPLAY( " -p<path> : select output path (required in file generation mode)\n");
    DISPLAY( "                in multiple files mode this should be a directory\n");
    DISPLAY( " -o<path> : select path to output original file (default:no output)\n");
    DISPLAY( "                in multiple files mode this should be a directory\n");
    DISPLAY( " -s#      : select seed (default:random based on time)\n");
    DISPLAY( " -n#      : number of files to generate (default:1)\n");
    DISPLAY( " -t       : activate test mode (test files against libzstd instead of outputting them)\n");
    DISPLAY( " -T#      : length of time to run tests for, in seconds (suffix m or mn : minutes)\n");
    DISPLAY( " -v       : increase verbosity level (default:0, max:7)\n");
    DISPLAY( " -h/H     : display help/long help and exit\n");
}
1756 
/* advancedUsage() :
 * Prints the short help (see usage()) followed by the description of the
 * long-form `--` options, all on stderr (through DISPLAY). */
static void advancedUsage(const char* programName)
{
    usage(programName);

    /* adjacent string literals are concatenated by the compiler :
     * the emitted text is the same as one DISPLAY() per line */
    DISPLAY( "\n"
             "Advanced arguments        :\n"
             " --content-size           : always include the content size in the frame header\n"
             " --use-dict=#             : include a dictionary used to decompress the corpus\n"
             " --gen-blocks             : generate raw compressed blocks without block/frame headers\n");
    DISPLAY( " --max-block-size-log=#   : max block size log, must be in range [2, 17]\n"
             " --max-content-size-log=# : max content size log, must be <= 20\n"
             "                            (this is ignored with gen-blocks)\n");
}
1769 
/*! readU32FromChar() :
 *  Decodes the unsigned decimal number found at `*stringPtr`.
 *  An optional size suffix is accepted and applied :
 *  K, KB or KiB multiply by 1024 ; M, MB or MiB multiply by 1024*1024.
 *  `*stringPtr` is advanced to the first character that was not consumed.
 *  @return : the decoded value (0 when no digit is present).
 *  Note : the result wraps around if it exceeds the range of `unsigned`. */
static unsigned readU32FromChar(const char** stringPtr)
{
    const char* cursor = *stringPtr;
    unsigned value = 0;

    /* decimal digits */
    for ( ; (*cursor >= '0') && (*cursor <= '9'); cursor++) {
        value = value * 10 + (unsigned)(*cursor - '0');
    }

    /* optional suffix : [K|M] then optional 'i' then optional 'B' */
    if ((*cursor == 'K') || (*cursor == 'M')) {
        unsigned const shift = (*cursor == 'M') ? 20 : 10;
        value <<= shift;
        cursor++;
        if (*cursor == 'i') cursor++;
        if (*cursor == 'B') cursor++;
    }

    *stringPtr = cursor;
    return value;
}
1789 
/** longCommandWArg() :
 *  Tests whether the string at *stringPtr begins with longCommand.
 *  On a match, *stringPtr is moved just past longCommand and 1 is returned.
 *  Otherwise *stringPtr is left unchanged and 0 is returned.
 */
static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
{
    size_t const prefixLen = strlen(longCommand);

    if (strncmp(*stringPtr, longCommand, prefixLen) != 0) {
        return 0;
    }

    *stringPtr += prefixLen;
    return 1;
}
1802 
main(int argc,char ** argv)1803 int main(int argc, char** argv)
1804 {
1805     U32 seed = 0;
1806     int seedset = 0;
1807     unsigned numFiles = 0;
1808     unsigned testDuration = 0;
1809     int testMode = 0;
1810     const char* path = NULL;
1811     const char* origPath = NULL;
1812     int useDict = 0;
1813     unsigned dictSize = (10 << 10); /* 10 kB default */
1814     genType_e genType = gt_frame;
1815 
1816     int argNb;
1817 
1818     /* Check command line */
1819     for (argNb=1; argNb<argc; argNb++) {
1820         const char* argument = argv[argNb];
1821         if(!argument) continue;   /* Protection if argument empty */
1822 
1823         /* Handle commands. Aggregated commands are allowed */
1824         if (argument[0]=='-') {
1825             argument++;
1826             while (*argument!=0) {
1827                 switch(*argument)
1828                 {
1829                 case 'h':
1830                     usage(argv[0]);
1831                     return 0;
1832                 case 'H':
1833                     advancedUsage(argv[0]);
1834                     return 0;
1835                 case 'v':
1836                     argument++;
1837                     g_displayLevel++;
1838                     break;
1839                 case 's':
1840                     argument++;
1841                     seedset=1;
1842                     seed = readInt(&argument);
1843                     break;
1844                 case 'n':
1845                     argument++;
1846                     numFiles = readInt(&argument);
1847                     break;
1848                 case 'T':
1849                     argument++;
1850                     testDuration = readInt(&argument);
1851                     if (*argument == 'm') {
1852                         testDuration *= 60;
1853                         argument++;
1854                         if (*argument == 'n') argument++;
1855                     }
1856                     break;
1857                 case 'o':
1858                     argument++;
1859                     origPath = argument;
1860                     argument += strlen(argument);
1861                     break;
1862                 case 'p':
1863                     argument++;
1864                     path = argument;
1865                     argument += strlen(argument);
1866                     break;
1867                 case 't':
1868                     argument++;
1869                     testMode = 1;
1870                     break;
1871                 case '-':
1872                     argument++;
1873                     if (strcmp(argument, "content-size") == 0) {
1874                         opts.contentSize = 1;
1875                     } else if (longCommandWArg(&argument, "use-dict=")) {
1876                         dictSize = readU32FromChar(&argument);
1877                         useDict = 1;
1878                     } else if (strcmp(argument, "gen-blocks") == 0) {
1879                         genType = gt_block;
1880                     } else if (longCommandWArg(&argument, "max-block-size-log=")) {
1881                         U32 value = readU32FromChar(&argument);
1882                         if (value >= 2 && value <= ZSTD_BLOCKSIZE_MAX) {
1883                             g_maxBlockSize = 1U << value;
1884                         }
1885                     } else if (longCommandWArg(&argument, "max-content-size-log=")) {
1886                         U32 value = readU32FromChar(&argument);
1887                         g_maxDecompressedSizeLog =
1888                                 MIN(MAX_DECOMPRESSED_SIZE_LOG, value);
1889                     } else {
1890                         advancedUsage(argv[0]);
1891                         return 1;
1892                     }
1893                     argument += strlen(argument);
1894                     break;
1895                 default:
1896                     usage(argv[0]);
1897                     return 1;
1898     }   }   }   }   /* for (argNb=1; argNb<argc; argNb++) */
1899 
1900     if (!seedset) {
1901         seed = makeSeed();
1902     }
1903 
1904     if (testMode) {
1905         return runTestMode(seed, numFiles, testDuration, genType);
1906     } else {
1907         if (testDuration) {
1908             DISPLAY("Error: -T requires test mode (-t)\n\n");
1909             usage(argv[0]);
1910             return 1;
1911         }
1912     }
1913 
1914     if (!path) {
1915         DISPLAY("Error: path is required in file generation mode\n");
1916         usage(argv[0]);
1917         return 1;
1918     }
1919 
1920     if (numFiles == 0 && useDict == 0) {
1921         return generateFile(seed, path, origPath, genType);
1922     } else if (useDict == 0){
1923         return generateCorpus(seed, numFiles, path, origPath, genType);
1924     } else {
1925         /* should generate files with a dictionary */
1926         return generateCorpusWithDict(seed, numFiles, path, origPath, dictSize, genType);
1927     }
1928 
1929 }
1930