• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) Yann Collet, Facebook, Inc.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 
12 
13 /* **************************************
14 *  Compiler Warnings
15 ****************************************/
16 #ifdef _MSC_VER
17 #  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
18 #endif
19 
20 
21 /*-*************************************
22 *  Includes
23 ***************************************/
24 #include "platform.h"       /* Large Files support */
25 #include "util.h"           /* UTIL_getFileSize, UTIL_getTotalFileSize */
26 #include <stdlib.h>         /* malloc, free */
27 #include <string.h>         /* memset */
28 #include <stdio.h>          /* fprintf, fopen, ftello64 */
29 #include <errno.h>          /* errno */
30 #include <assert.h>
31 
32 #include "timefn.h"         /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
33 #include "../lib/common/mem.h"  /* read */
34 #include "../lib/common/error_private.h"
35 #include "dibio.h"
36 
37 
38 /*-*************************************
39 *  Constants
40 ***************************************/
41 #define KB *(1 <<10)
42 #define MB *(1 <<20)
43 #define GB *(1U<<30)
44 
45 #define SAMPLESIZE_MAX (128 KB)
46 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
47 #define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
48 #define FASTCOVER_MEMMULT 1    /* rough estimation : memory cost to analyze 1 byte of sample */
49 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
50 
51 #define NOISELENGTH 32
52 #define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
53 
54 
55 /*-*************************************
56 *  Console display
57 ***************************************/
58 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
59 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
60 
61 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
63 
64 #define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
65             if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
66             { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
67             if (displayLevel>=4) fflush(stderr); } } }
68 
69 /*-*************************************
70 *  Exceptions
71 ***************************************/
72 #ifndef DEBUG
73 #  define DEBUG 0
74 #endif
75 #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
76 #define EXM_THROW(error, ...)                                             \
77 {                                                                         \
78     DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
79     DISPLAY("Error %i : ", error);                                        \
80     DISPLAY(__VA_ARGS__);                                                 \
81     DISPLAY("\n");                                                        \
82     exit(error);                                                          \
83 }
84 
85 
86 /* ********************************************************
87 *  Helper functions
88 **********************************************************/
89 #undef MIN
90 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
91 
92 /**
93   Returns the size of a file.
94   If error returns -1.
95 */
DiB_getFileSize(const char * fileName)96 static S64 DiB_getFileSize (const char * fileName)
97 {
98     U64 const fileSize = UTIL_getFileSize(fileName);
99     return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
100 }
101 
102 /* ********************************************************
103 *  File related operations
104 **********************************************************/
105 /** DiB_loadFiles() :
106  *  load samples from files listed in fileNamesTable into buffer.
107  *  works even if buffer is too small to load all samples.
108  *  Also provides the size of each sample into sampleSizes table
109  *  which must be sized correctly, using DiB_fileStats().
110  * @return : nb of samples effectively loaded into `buffer`
111  * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
112  *  sampleSizes is filled with the size of each sample.
113  */
DiB_loadFiles(void * buffer,size_t * bufferSizePtr,size_t * sampleSizes,int sstSize,const char ** fileNamesTable,int nbFiles,size_t targetChunkSize,int displayLevel)114 static int DiB_loadFiles(
115     void* buffer, size_t* bufferSizePtr,
116     size_t* sampleSizes, int sstSize,
117     const char** fileNamesTable, int nbFiles,
118     size_t targetChunkSize, int displayLevel )
119 {
120     char* const buff = (char*)buffer;
121     size_t totalDataLoaded = 0;
122     int nbSamplesLoaded = 0;
123     int fileIndex = 0;
124     FILE * f = NULL;
125 
126     assert(targetChunkSize <= SAMPLESIZE_MAX);
127 
128     while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
129         size_t fileDataLoaded;
130         S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
131         if (fileSize <= 0) /* skip if zero-size or file error */
132             continue;
133 
134         f = fopen( fileNamesTable[fileIndex], "rb");
135         if (f == NULL)
136             EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
137         DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[fileIndex]);
138 
139         /* Load the first chunk of data from the file */
140         fileDataLoaded = targetChunkSize > 0 ?
141                             (size_t)MIN(fileSize, (S64)targetChunkSize) :
142                             (size_t)MIN(fileSize, SAMPLESIZE_MAX );
143         if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
144             break;
145         if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
146             EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
147         sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
148         totalDataLoaded += fileDataLoaded;
149 
150         /* If file-chunking is enabled, load the rest of the file as more samples */
151         if (targetChunkSize > 0) {
152             while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
153                 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
154                 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
155                     break;
156 
157                 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
158                     EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
159                 sampleSizes[nbSamplesLoaded++] = chunkSize;
160                 totalDataLoaded += chunkSize;
161                 fileDataLoaded += chunkSize;
162             }
163         }
164         fileIndex += 1;
165         fclose(f); f = NULL;
166     }
167     if (f != NULL)
168         fclose(f);
169 
170     DISPLAYLEVEL(2, "\r%79s\r", "");
171     DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
172         (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
173     *bufferSizePtr = totalDataLoaded;
174     return nbSamplesLoaded;
175 }
176 
177 #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
DiB_rand(U32 * src)178 static U32 DiB_rand(U32* src)
179 {
180     static const U32 prime1 = 2654435761U;
181     static const U32 prime2 = 2246822519U;
182     U32 rand32 = *src;
183     rand32 *= prime1;
184     rand32 ^= prime2;
185     rand32  = DiB_rotl32(rand32, 13);
186     *src = rand32;
187     return rand32 >> 5;
188 }
189 
190 /* DiB_shuffle() :
191  * shuffle a table of file names in a semi-random way
192  * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
193  * it will load random elements from it, instead of just the first ones. */
DiB_shuffle(const char ** fileNamesTable,unsigned nbFiles)194 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
195     U32 seed = 0xFD2FB528;
196     unsigned i;
197     assert(nbFiles >= 1);
198     for (i = nbFiles - 1; i > 0; --i) {
199         unsigned const j = DiB_rand(&seed) % (i + 1);
200         const char* const tmp = fileNamesTable[j];
201         fileNamesTable[j] = fileNamesTable[i];
202         fileNamesTable[i] = tmp;
203     }
204 }
205 
206 
207 /*-********************************************************
208 *  Dictionary training functions
209 **********************************************************/
DiB_findMaxMem(unsigned long long requiredMem)210 static size_t DiB_findMaxMem(unsigned long long requiredMem)
211 {
212     size_t const step = 8 MB;
213     void* testmem = NULL;
214 
215     requiredMem = (((requiredMem >> 23) + 1) << 23);
216     requiredMem += step;
217     if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
218 
219     while (!testmem) {
220         testmem = malloc((size_t)requiredMem);
221         requiredMem -= step;
222     }
223 
224     free(testmem);
225     return (size_t)requiredMem;
226 }
227 
228 
/* DiB_fillNoise() :
 * fill `buffer` with `length` bytes of deterministic pseudo-random noise
 * (simple multiplicative generator; byte taken from bits 21..28 of the state). */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const seed = 2654435761U;   /* prime1 */
    unsigned const mult = 2246822519U;   /* prime2 */
    unsigned char* const bytes = (unsigned char*)buffer;
    unsigned state = seed;
    size_t idx;

    for (idx = 0; idx < length; idx++) {
        state *= mult;
        bytes[idx] = (unsigned char)(state >> 21);
    }
}
241 
242 
DiB_saveDict(const char * dictFileName,const void * buff,size_t buffSize)243 static void DiB_saveDict(const char* dictFileName,
244                          const void* buff, size_t buffSize)
245 {
246     FILE* const f = fopen(dictFileName, "wb");
247     if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
248 
249     { size_t const n = fwrite(buff, 1, buffSize, f);
250       if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
251 
252     { size_t const n = (size_t)fclose(f);
253       if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
254 }
255 
/* Aggregate statistics over a list of sample files,
 * produced by DiB_fileStats() and consumed by DiB_trainFromFiles()
 * for buffer/table allocation. */
typedef struct {
    S64 totalSizeToLoad;    /* total nb of bytes that would be loaded (each file capped at SAMPLESIZE_MAX when not chunking) */
    int nbSamples;          /* resulting nb of samples (1 per file, or per chunk when chunking is enabled) */
    int oneSampleTooLarge;  /* flag : at least one file exceeds 2*SAMPLESIZE_MAX */
} fileStats;
261 
262 /*! DiB_fileStats() :
263  *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
264  *  provides the amount of data to be loaded and the resulting nb of samples.
265  *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
266  */
DiB_fileStats(const char ** fileNamesTable,int nbFiles,size_t chunkSize,int displayLevel)267 static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
268 {
269     fileStats fs;
270     int n;
271     memset(&fs, 0, sizeof(fs));
272 
273     // We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX
274     assert( chunkSize <= SAMPLESIZE_MAX );
275 
276     for (n=0; n<nbFiles; n++) {
277       S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
278       // TODO: is there a minimum sample size? What if the file is 1-byte?
279       if (fileSize == 0) {
280         DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
281         continue;
282       }
283 
284       /* the case where we are breaking up files in sample chunks */
285       if (chunkSize > 0)
286       {
287         // TODO: is there a minimum sample size? Can we have a 1-byte sample?
288         fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
289         fs.totalSizeToLoad += fileSize;
290       }
291       else {
292       /* the case where one file is one sample */
293         if (fileSize > SAMPLESIZE_MAX) {
294           /* flag excessively large sample files */
295           fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
296 
297           /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
298           DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
299               fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
300         }
301         fs.nbSamples += 1;
302         fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
303       }
304     }
305     DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
306     return fs;
307 }
308 
/*! DiB_trainFromFiles() :
 *  Train a dictionary from the samples in `fileNamesTable` and save it
 *  into `dictFileName` (at most `maxDictSize` bytes).
 *  Exactly one of params / coverParams / fastCoverParams must be non-NULL
 *  (asserted below); it selects the training algorithm (legacy / cover / fastCover).
 * @return : 0 on success, 1 if dictionary training failed.
 *  Note : exits the process via EXM_THROW() on allocation failure
 *  or when fewer than 5 samples are available. */
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    /* verbosity comes from whichever parameter set was provided */
    int const displayLevel = params ? params->zParams.notificationLevel :
        coverParams ? coverParams->zParams.notificationLevel :
        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        /* each algorithm has a different per-byte analysis cost */
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory */
        /* Limit the size of the training data to 2GB */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        /* +NOISELENGTH : room for the guard band written by DiB_fillNoise() below */
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
                                                             coverParams);
              if (!ZDICT_isError(dictSize)) {
                  /* report the parameters selected by the optimizer */
                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                              coverParams->steps, splitPercentage);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                     sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
                                                              fastCoverParams);
              if (!ZDICT_isError(dictSize)) {
                /* report the parameters selected by the optimizer */
                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                            fastCoverParams->accel);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
437