1 /*
2 * Copyright (c) Yann Collet, Facebook, Inc.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11
12
13 /* **************************************
14 * Compiler Warnings
15 ****************************************/
16 #ifdef _MSC_VER
17 # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
18 #endif
19
20
21 /*-*************************************
22 * Includes
23 ***************************************/
24 #include "platform.h" /* Large Files support */
25 #include "util.h" /* UTIL_getFileSize, UTIL_getTotalFileSize */
26 #include <stdlib.h> /* malloc, free */
27 #include <string.h> /* memset */
28 #include <stdio.h> /* fprintf, fopen, ftello64 */
29 #include <errno.h> /* errno */
30 #include <assert.h>
31
32 #include "timefn.h" /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
33 #include "../lib/common/mem.h" /* read */
34 #include "../lib/common/error_private.h"
35 #include "dibio.h"
36
37
38 /*-*************************************
39 * Constants
40 ***************************************/
41 #define KB *(1 <<10)
42 #define MB *(1 <<20)
43 #define GB *(1U<<30)
44
45 #define SAMPLESIZE_MAX (128 KB)
46 #define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
47 #define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
48 #define FASTCOVER_MEMMULT 1 /* rough estimation : memory cost to analyze 1 byte of sample */
49 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
50
51 #define NOISELENGTH 32
52 #define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
53
54
55 /*-*************************************
56 * Console display
57 ***************************************/
58 #define DISPLAY(...) fprintf(stderr, __VA_ARGS__)
59 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
60
61 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
63
64 #define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
65 if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
66 { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
67 if (displayLevel>=4) fflush(stderr); } } }
68
69 /*-*************************************
70 * Exceptions
71 ***************************************/
72 #ifndef DEBUG
73 # define DEBUG 0
74 #endif
75 #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
76 #define EXM_THROW(error, ...) \
77 { \
78 DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
79 DISPLAY("Error %i : ", error); \
80 DISPLAY(__VA_ARGS__); \
81 DISPLAY("\n"); \
82 exit(error); \
83 }
84
85
86 /* ********************************************************
87 * Helper functions
88 **********************************************************/
89 #undef MIN
90 #define MIN(a,b) ((a) < (b) ? (a) : (b))
91
92 /**
93 Returns the size of a file.
94 If error returns -1.
95 */
DiB_getFileSize(const char * fileName)96 static S64 DiB_getFileSize (const char * fileName)
97 {
98 U64 const fileSize = UTIL_getFileSize(fileName);
99 return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
100 }
101
102 /* ********************************************************
103 * File related operations
104 **********************************************************/
105 /** DiB_loadFiles() :
106 * load samples from files listed in fileNamesTable into buffer.
107 * works even if buffer is too small to load all samples.
108 * Also provides the size of each sample into sampleSizes table
109 * which must be sized correctly, using DiB_fileStats().
110 * @return : nb of samples effectively loaded into `buffer`
111 * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
112 * sampleSizes is filled with the size of each sample.
113 */
DiB_loadFiles(void * buffer,size_t * bufferSizePtr,size_t * sampleSizes,int sstSize,const char ** fileNamesTable,int nbFiles,size_t targetChunkSize,int displayLevel)114 static int DiB_loadFiles(
115 void* buffer, size_t* bufferSizePtr,
116 size_t* sampleSizes, int sstSize,
117 const char** fileNamesTable, int nbFiles,
118 size_t targetChunkSize, int displayLevel )
119 {
120 char* const buff = (char*)buffer;
121 size_t totalDataLoaded = 0;
122 int nbSamplesLoaded = 0;
123 int fileIndex = 0;
124 FILE * f = NULL;
125
126 assert(targetChunkSize <= SAMPLESIZE_MAX);
127
128 while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
129 size_t fileDataLoaded;
130 S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
131 if (fileSize <= 0) /* skip if zero-size or file error */
132 continue;
133
134 f = fopen( fileNamesTable[fileIndex], "rb");
135 if (f == NULL)
136 EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
137 DISPLAYUPDATE(2, "Loading %s... \r", fileNamesTable[fileIndex]);
138
139 /* Load the first chunk of data from the file */
140 fileDataLoaded = targetChunkSize > 0 ?
141 (size_t)MIN(fileSize, (S64)targetChunkSize) :
142 (size_t)MIN(fileSize, SAMPLESIZE_MAX );
143 if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
144 break;
145 if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
146 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
147 sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
148 totalDataLoaded += fileDataLoaded;
149
150 /* If file-chunking is enabled, load the rest of the file as more samples */
151 if (targetChunkSize > 0) {
152 while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
153 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
154 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
155 break;
156
157 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
158 EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
159 sampleSizes[nbSamplesLoaded++] = chunkSize;
160 totalDataLoaded += chunkSize;
161 fileDataLoaded += chunkSize;
162 }
163 }
164 fileIndex += 1;
165 fclose(f); f = NULL;
166 }
167 if (f != NULL)
168 fclose(f);
169
170 DISPLAYLEVEL(2, "\r%79s\r", "");
171 DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
172 (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
173 *bufferSizePtr = totalDataLoaded;
174 return nbSamplesLoaded;
175 }
176
177 #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
DiB_rand(U32 * src)178 static U32 DiB_rand(U32* src)
179 {
180 static const U32 prime1 = 2654435761U;
181 static const U32 prime2 = 2246822519U;
182 U32 rand32 = *src;
183 rand32 *= prime1;
184 rand32 ^= prime2;
185 rand32 = DiB_rotl32(rand32, 13);
186 *src = rand32;
187 return rand32 >> 5;
188 }
189
190 /* DiB_shuffle() :
191 * shuffle a table of file names in a semi-random way
192 * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
193 * it will load random elements from it, instead of just the first ones. */
DiB_shuffle(const char ** fileNamesTable,unsigned nbFiles)194 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
195 U32 seed = 0xFD2FB528;
196 unsigned i;
197 assert(nbFiles >= 1);
198 for (i = nbFiles - 1; i > 0; --i) {
199 unsigned const j = DiB_rand(&seed) % (i + 1);
200 const char* const tmp = fileNamesTable[j];
201 fileNamesTable[j] = fileNamesTable[i];
202 fileNamesTable[i] = tmp;
203 }
204 }
205
206
207 /*-********************************************************
208 * Dictionary training functions
209 **********************************************************/
DiB_findMaxMem(unsigned long long requiredMem)210 static size_t DiB_findMaxMem(unsigned long long requiredMem)
211 {
212 size_t const step = 8 MB;
213 void* testmem = NULL;
214
215 requiredMem = (((requiredMem >> 23) + 1) << 23);
216 requiredMem += step;
217 if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
218
219 while (!testmem) {
220 testmem = malloc((size_t)requiredMem);
221 requiredMem -= step;
222 }
223
224 free(testmem);
225 return (size_t)requiredMem;
226 }
227
228
/* Fill `buffer` with `length` bytes of deterministic pseudo-random noise,
 * generated by a multiplicative congruential sequence (fixed seed). */
static void DiB_fillNoise(void* buffer, size_t length)
{
    unsigned const seed = 2654435761U;   /* prime1 */
    unsigned const mult = 2246822519U;   /* prime2 */
    unsigned char* const bytes = (unsigned char*)buffer;
    unsigned state = seed;
    size_t i;

    for (i = 0; i < length; i++) {
        state *= mult;
        bytes[i] = (unsigned char)(state >> 21);
    }
}
241
242
DiB_saveDict(const char * dictFileName,const void * buff,size_t buffSize)243 static void DiB_saveDict(const char* dictFileName,
244 const void* buff, size_t buffSize)
245 {
246 FILE* const f = fopen(dictFileName, "wb");
247 if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
248
249 { size_t const n = fwrite(buff, 1, buffSize, f);
250 if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
251
252 { size_t const n = (size_t)fclose(f);
253 if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
254 }
255
/* Aggregate statistics over a set of sample files, produced by DiB_fileStats(). */
typedef struct {
    S64 totalSizeToLoad;    /* total bytes to load across all samples (per-file capped at SAMPLESIZE_MAX when not chunking) */
    int nbSamples;          /* nb of samples : one per file, or one per chunk when chunking is enabled */
    int oneSampleTooLarge;  /* flag : set when at least one file exceeds 2*SAMPLESIZE_MAX */
} fileStats;
261
262 /*! DiB_fileStats() :
263 * Given a list of files, and a chunkSize (0 == no chunk, whole files)
264 * provides the amount of data to be loaded and the resulting nb of samples.
265 * This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
266 */
DiB_fileStats(const char ** fileNamesTable,int nbFiles,size_t chunkSize,int displayLevel)267 static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
268 {
269 fileStats fs;
270 int n;
271 memset(&fs, 0, sizeof(fs));
272
273 // We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX
274 assert( chunkSize <= SAMPLESIZE_MAX );
275
276 for (n=0; n<nbFiles; n++) {
277 S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
278 // TODO: is there a minimum sample size? What if the file is 1-byte?
279 if (fileSize == 0) {
280 DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
281 continue;
282 }
283
284 /* the case where we are breaking up files in sample chunks */
285 if (chunkSize > 0)
286 {
287 // TODO: is there a minimum sample size? Can we have a 1-byte sample?
288 fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
289 fs.totalSizeToLoad += fileSize;
290 }
291 else {
292 /* the case where one file is one sample */
293 if (fileSize > SAMPLESIZE_MAX) {
294 /* flag excessively large sample files */
295 fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
296
297 /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
298 DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
299 fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
300 }
301 fs.nbSamples += 1;
302 fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
303 }
304 }
305 DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
306 return fs;
307 }
308
/*! DiB_trainFromFiles() :
 *  Train a dictionary from the listed sample files and save it into dictFileName.
 *  Exactly one of params / coverParams / fastCoverParams must be non-NULL :
 *  it selects the training algorithm (legacy / cover / fastCover) and carries
 *  the notification level. chunkSize > 0 splits files into fixed-size samples;
 *  chunkSize == 0 means one file == one sample (capped at SAMPLESIZE_MAX).
 *  memLimit != 0 manually caps the amount of training data loaded.
 * @return : 0 on success, 1 if dictionary training failed.
 *  Note : exits the process (EXM_THROW) on allocation failure or if fewer
 *  than 5 samples are available. */
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                       const char** fileNamesTable, int nbFiles, size_t chunkSize,
                       ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
    fileStats fs;
    size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
    int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
    size_t loadedSize; /* total data loaded in srcBuffer for all samples */
    void* srcBuffer /* contiguous buffer with training data/samples */;
    void* const dictBuffer = malloc(maxDictSize);
    int result = 0;

    /* the notification level comes from whichever parameter struct was provided */
    int const displayLevel = params ? params->zParams.notificationLevel :
                        coverParams ? coverParams->zParams.notificationLevel :
                        fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;

    /* Shuffle input files before we start assessing how much sample data to load.
       The purpose of the shuffle is to pick random samples when the sample
       set is larger than what we can load in memory. */
    DISPLAYLEVEL(3, "Shuffling input files\n");
    DiB_shuffle(fileNamesTable, nbFiles);

    /* Figure out how much sample data to load with how many samples */
    fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);

    {
        /* each training algorithm has a different per-byte memory cost estimate */
        int const memMult = params ? MEMMULT :
                            coverParams ? COVER_MEMMULT:
                            FASTCOVER_MEMMULT;
        size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
        /* Limit the size of the training data to the free memory */
        /* Limit the size of the training data to 2GB */
        /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
        loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
        if (memLimit != 0) {
            DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
                (unsigned)(memLimit / (1 MB)));
            loadedSize = (size_t)MIN(loadedSize, memLimit);
        }
        /* +NOISELENGTH : guard band, filled by DiB_fillNoise() for the legacy trainer */
        srcBuffer = malloc(loadedSize+NOISELENGTH);
        sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
    }

    /* Checks */
    if ((!sampleSizes) || (!srcBuffer) || (!dictBuffer))
        EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
    if (fs.oneSampleTooLarge) {
        DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
        DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
        DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
    }
    if (fs.nbSamples < 5) {
        DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
        DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
        DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
        EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
    }
    if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
        DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
        DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
    }

    /* init */
    if ((S64)loadedSize < fs.totalSizeToLoad)
        DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
            (unsigned)(fs.totalSizeToLoad / (1 MB)),
            (unsigned)(loadedSize / (1 MB)));

    /* Load input buffer */
    nbSamplesLoaded = DiB_loadFiles(
        srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
        nbFiles, chunkSize, displayLevel);

    /* dispatch to the selected training algorithm */
    {   size_t dictSize;
        if (params) {
            DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
            dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
                                                    srcBuffer, sampleSizes, nbSamplesLoaded,
                                                    *params);
        } else if (coverParams) {
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
                                                             srcBuffer, sampleSizes, nbSamplesLoaded,
                                                             coverParams);
              if (!ZDICT_isError(dictSize)) {
                  /* report the parameters found by the optimizer */
                  unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
                  DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
                              coverParams->steps, splitPercentage);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
                                                     sampleSizes, nbSamplesLoaded, *coverParams);
            }
        } else {
            assert(fastCoverParams != NULL);
            if (optimize) {
              dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
                                                              fastCoverParams);
              if (!ZDICT_isError(dictSize)) {
                /* report the parameters found by the optimizer */
                unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
                DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
                            fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
                            fastCoverParams->accel);
              }
            } else {
              dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
                                                        sampleSizes, nbSamplesLoaded, *fastCoverParams);
            }
        }
        if (ZDICT_isError(dictSize)) {
            DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
            result = 1;
            goto _cleanup;
        }
        /* save dict */
        DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
        DiB_saveDict(dictFileName, dictBuffer, dictSize);
    }

    /* clean up */
_cleanup:
    free(srcBuffer);
    free(sampleSizes);
    free(dictBuffer);
    return result;
}
437