/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "zstd_compress_internal.h"  /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
#include "zstd_fast.h"

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms,
                                const void* const end,
                                ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
    U32 const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_full for filling CDict tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_full);

    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        {   size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
            ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr);   }

        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) {  /* not yet filled */
                    ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
    }   }   }   }
}

static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms,
                               const void* const end,
                               ZSTD_dictTableLoadMethod_e dtlm)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hBits = cParams->hashLog;
    U32 const mls = cParams->minMatch;
    const BYTE* const base = ms->window.base;
    const BYTE* ip = base + ms->nextToUpdate;
    const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
    const U32 fastHashFillStep = 3;

    /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.
     * Feel free to remove this assert if there's a good reason! */
    assert(dtlm == ZSTD_dtlm_fast);

    /* Always insert every fastHashFillStep position into the hash table.
     * Insert the other positions if their hash entry is empty.
     */
    for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
        U32 const curr = (U32)(ip - base);
        size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
        hashTable[hash0] = curr;
        if (dtlm == ZSTD_dtlm_fast) continue;
        /* Only load extra positions for ZSTD_dtlm_full */
        {   U32 p;
            for (p = 1; p < fastHashFillStep; ++p) {
                size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
                if (hashTable[hash] == 0) {  /* not yet filled */
                    hashTable[hash] = curr + p;
    }   }   }   }
}

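/* Dispatches to the appropriate table-filling routine above: CDict tables
 * store short-cache-tagged indices (see ZSTD_writeTaggedIndex), whereas CCtx
 * tables store plain position indices. */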
void ZSTD_fillHashTable(ZSTD_MatchState_t* ms,
                        const void* const end,
                        ZSTD_dictTableLoadMethod_e dtlm,
                        ZSTD_tableFillPurpose_e tfp)
{
    if (tfp == ZSTD_tfp_forCDict) {
        ZSTD_fillHashTableForCDict(ms, end, dtlm);
    } else {
        ZSTD_fillHashTableForCCtx(ms, end, dtlm);
    }
}


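/* Predicate: returns non-zero iff matchIdx is a usable index (>= idxLowLimit)
 * and the 4 bytes at currentPtr equal the 4 bytes at matchAddress.
 * Two implementations follow: a branchless (cmov) variant for when the
 * "candidate in range" test is unpredictable, and a branchy variant for when
 * it is predictable. */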
typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit);

static int
ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* Array of ~random data, should have low probability of matching data.
     * Load from here if the index is invalid.
     * Used to avoid unpredictable branches. */
    static const BYTE dummy[] = {0x12,0x34,0x56,0x78};

    /* matchIdx >= idxLowLimit is a (somewhat) unpredictable branch.
     * However, the expression below compiles into a conditional move.
     */
    const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy);
    /* Note: this used to be written as : return test1 && test2;
     * Unfortunately, once inlined, these tests become branches,
     * in which case it becomes critical that they are executed in the right order (test1 then test2).
     * So we have to write these tests in a specific manner to ensure their ordering.
     */
    if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0;
    /* force ordering of these tests, which matters once the function is inlined, as they become branches */
#if defined(__GNUC__)
    __asm__("");
#endif
    return matchIdx >= idxLowLimit;
}

static int
ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit)
{
    /* using a branch instead of a cmov,
     * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true,
     * aka almost all candidates are within range */
    U32 mval;
    if (matchIdx >= idxLowLimit) {
        mval = MEM_read32(matchAddress);
    } else {
        mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */
    }

    return (MEM_read32(currentPtr) == mval);
}


/**
 * If you squint hard enough (and ignore repcodes), the search operation at any
 * given position is broken into 4 stages:
 *
 * 1. Hash   (map position to hash value via input read)
 * 2. Lookup (map hash val to index via hashtable read)
 * 3. Load   (map index to value at that position via input read)
 * 4. Compare
 *
 * Each of these steps involves a memory read at an address which is computed
 * from the previous step. This means these steps must be sequenced and their
 * latencies are cumulative.
 *
 * Rather than do 1->2->3->4 sequentially for a single position before moving
 * onto the next, this implementation interleaves these operations across the
 * next few positions:
 *
 * R = Repcode Read & Compare
 * H = Hash
 * T = Table Lookup
 * M = Match Read & Compare
 *
 * Pos | Time -->
 * ----+-------------------
 *  N  | ... M
 * N+1 | ...   TM
 * N+2 |    R H   T M
 * N+3 |         H    TM
 * N+4 |       R H   T M
 * N+5 |         H   ...
 * N+6 |       R ...
 *
 * This is very much analogous to the pipelining of execution in a CPU. And just
 * like a CPU, we have to dump the pipeline when we find a match (i.e., take a
 * branch).
 *
 * When this happens, we throw away our current state, and do the following prep
 * to re-enter the loop:
 *
 * Pos | Time -->
 * ----+-------------------
 *  N  | H T
 * N+1 |   H
 *
 * This is also the work we do at the beginning to enter the loop initially.
 */
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_noDict_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize,
        U32 const mls, int useCmov)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;

    const BYTE* anchor = istart;
    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;

    U32 rep_offset1 = rep[0];
    U32 rep_offset2 = rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 matchIdx; /* match idx for ip0 */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;

    /* ip0 and ip1 are always adjacent. The targetLength skipping and
     * uncompressibility acceleration is applied to every other position,
     * matching the behavior of #1562. step therefore represents the gap
     * between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));
    const ZSTD_match4Found matchFound = useCmov ? ZSTD_match4Found_cmov : ZSTD_match4Found_branch;

    DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
    ip0 += (ip0 == prefixStart);
    {   U32 const curr = (U32)(ip0 - base);
        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
        U32 const maxRep = curr - windowLow;
        if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0;
        if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    matchIdx = hashTable[hash0];

    do {
        /* load repcode match for ip[2] */
        const U32 rval = MEM_read32(ip2 - rep_offset1);

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        /* check repcode at ip[2] */
        if ((MEM_read32(ip2) == rval) & (rep_offset1 > 0)) {
            ip0 = ip2;
            match0 = ip0 - rep_offset1;
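            /* Branchless 1-byte backward extension: if the byte just before
             * ip0 also matches, start the sequence one byte earlier. */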
            mLength = ip0[-1] == match0[-1];
            ip0 -= mLength;
            match0 -= mLength;
            offcode = REPCODE1_TO_OFFBASE;
            mLength += 4;

            /* Write next hash table entry: it's already calculated.
             * This write is known to be safe because ip1 is before the
             * repcode (ip2). */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _match;
        }

        if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
            /* Write next hash table entry (it's already calculated).
             * This write is known to be safe because ip1 == ip0 + 1,
             * so searching will resume after ip1. */
            hashTable[hash1] = (U32)(ip1 - base);

            goto _offset;
        }

        /* lookup ip[1] */
        matchIdx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) {
            /* Write next hash table entry, since it's already calculated */
            if (step <= 4) {
                /* Avoid writing an index if it's >= position where search will resume.
                 * The minimum possible match has length 4, so search can resume at ip0 + 4.
                 */
                hashTable[hash1] = (U32)(ip1 - base);
            }
            goto _offset;
        }

        /* lookup ip[1] */
        matchIdx = hashTable[hash1];

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

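        /* Acceleration: after each additional kStepIncr bytes consumed without
         * finding a match, widen the gap between probed position pairs by one,
         * so incompressible regions are skipped progressively faster. */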
        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions one could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* When the repcodes are outside of the prefix, we set them to zero before the loop.
     * When the offsets are still zero, we need to restore them after the block to have a correct
     * repcode history. If only one offset was invalid, it is easy. The tricky case is when both
     * offsets were invalid. We need to figure out which offset to refill with.
     *     - If both offsets are zero they are in the same order.
     *     - If both offsets are non-zero, we won't restore the offsets from offsetSaved1/offsetSaved2.
     *     - If only one is zero, we need to decide which offset to restore.
     *         - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
     *         - It is impossible for rep_offset2 to be non-zero.
     *
     * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
     * set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
     */
    offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1;
    rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx */

    /* Compute the offset code. */
    match0 = base + matchIdx;
    rep_offset2 = rep_offset1;
    rep_offset1 = (U32)(ip0-match0);
    offcode = OFFSET_TO_OFFBASE(rep_offset1);
    mLength = 4;

    /* Count the backwards match length. */
    while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] == match0[-1])) {
        ip0--;
        match0--;
        mLength++;
    }

_match: /* Requires: ip0, match0, offcode */

    /* Count the forward length. */
    mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        if (rep_offset2 > 0) { /* rep_offset2==0 means rep_offset2 is invalidated */
            while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
                /* store sequence */
                size_t const rLength = ZSTD_count(ip0+4, ip0+4-rep_offset2, iend) + 4;
                { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += rLength;
                ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
                anchor = ip0;
                continue;   /* faster when present (confirmed on gcc-8) ... (?) */
    }   }   }

    goto _start;
}

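/* Generates a concrete block compressor
 * ZSTD_compressBlock_fast_<dictMode>_<mml>_<cmov>, instantiating the matching
 * _generic function with a fixed minimum match length (mml) and a fixed value
 * for its last parameter (useCmov for noDict, hasStep otherwise). */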
#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov)                                 \
    static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov(        \
            ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
            void const* src, size_t srcSize)                                  \
    {                                                                         \
        return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \
    }

ZSTD_GEN_FAST_FN(noDict, 4, 1)
ZSTD_GEN_FAST_FN(noDict, 5, 1)
ZSTD_GEN_FAST_FN(noDict, 6, 1)
ZSTD_GEN_FAST_FN(noDict, 7, 1)

ZSTD_GEN_FAST_FN(noDict, 4, 0)
ZSTD_GEN_FAST_FN(noDict, 5, 0)
ZSTD_GEN_FAST_FN(noDict, 6, 0)
ZSTD_GEN_FAST_FN(noDict, 7, 0)

size_t ZSTD_compressBlock_fast(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mml = ms->cParams.minMatch;
    /* use cmov when "candidate in range" branch is likely unpredictable */
    int const useCmov = ms->cParams.windowLog < 19;
    assert(ms->dictMatchState == NULL);
    if (useCmov) {
        switch(mml)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
        }
    } else {
        /* use a branch instead */
        switch(mml)
        {
        default: /* includes case 3 */
        case 4 :
            return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
        case 5 :
            return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
        case 6 :
            return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
        case 7 :
            return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
        }
    }
}

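/* dictMatchState variant: candidates are searched both in the current window's
 * hash table and in the attached dictionary's (tagged) hash table; dictionary
 * indices are mapped into the local index space via dictIndexDelta. */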
FORCE_INLINE_TEMPLATE
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_dictMatchState_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
    const BYTE* const base = ms->window.base;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* ip0 = istart;
    const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
    const BYTE* anchor = istart;
    const U32   prefixStartIndex = ms->window.dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - HASH_READ_SIZE;
    U32 offset_1=rep[0], offset_2=rep[1];

    const ZSTD_MatchState_t* const dms = ms->dictMatchState;
    const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
    const U32* const dictHashTable = dms->hashTable;
    const U32 dictStartIndex       = dms->window.dictLimit;
    const BYTE* const dictBase     = dms->window.base;
    const BYTE* const dictStart    = dictBase + dictStartIndex;
    const BYTE* const dictEnd      = dms->window.nextSrc;
    const U32 dictIndexDelta       = prefixStartIndex - (U32)(dictEnd - dictBase);
    const U32 dictAndPrefixLength  = (U32)(istart - prefixStart + dictEnd - dictStart);
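    /* The attached CDict's hash table stores a short tag of each hash in the
     * low bits of every entry (see ZSTD_fillHashTableForCDict), so dictionary
     * lookups hash with ZSTD_SHORT_CACHE_TAG_BITS extra bits and compare tags
     * before dereferencing the candidate. */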
    const U32 dictHBits            = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;

    /* if a dictionary is still attached, it necessarily means that
     * it is within window size. So we just check it. */
    const U32 maxDistance = 1U << cParams->windowLog;
    const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
    assert(endIndex - prefixStartIndex <= maxDistance);
    (void)maxDistance; (void)endIndex;   /* these variables are not used when assert() is disabled */

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    /* ensure there will be no underflow
     * when translating a dict index into a local index */
    assert(prefixStartIndex >= (U32)(dictEnd - dictBase));

    if (ms->prefetchCDictTables) {
        size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32);
        PREFETCH_AREA(dictHashTable, hashTableBytes);
    }

    /* init */
    DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
    ip0 += (dictAndPrefixLength == 0);
    /* dictMatchState repCode checks don't currently handle repCode == 0
     * disabling. */
    assert(offset_1 <= dictAndPrefixLength);
    assert(offset_2 <= dictAndPrefixLength);

    /* Outer search loop */
    assert(stepSize >= 1);
    while (ip1 <= ilimit) {   /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
        size_t mLength;
        size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);

        size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
        U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
        int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);

        U32 matchIndex = hashTable[hash0];
        U32 curr = (U32)(ip0 - base);
        size_t step = stepSize;
        const size_t kStepIncr = 1 << kSearchStrength;
        const BYTE* nextStep = ip0 + kStepIncr;

        /* Inner search loop */
        while (1) {
            const BYTE* match = base + matchIndex;
            const U32 repIndex = curr + 1 - offset_1;
            const BYTE* repMatch = (repIndex < prefixStartIndex) ?
                                   dictBase + (repIndex - dictIndexDelta) :
                                   base + repIndex;
            const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
            size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
            hashTable[hash0] = curr;   /* update hash table */

            if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex))
                && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
                const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
                ip0++;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
                break;
            }

            if (dictTagsMatch) {
                /* Found a possible dict match */
                const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
                const BYTE* dictMatch = dictBase + dictMatchIndex;
                if (dictMatchIndex > dictStartIndex &&
                    MEM_read32(dictMatch) == MEM_read32(ip0)) {
                    /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */
                    if (matchIndex <= prefixStartIndex) {
                        U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
                        mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
                        while (((ip0 > anchor) & (dictMatch > dictStart))
                            && (ip0[-1] == dictMatch[-1])) {
                            ip0--;
                            dictMatch--;
                            mLength++;
                        } /* catch up */
                        offset_2 = offset_1;
                        offset_1 = offset;
                        ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                        break;
                    }
                }
            }

            if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) {
                /* found a regular match of size >= 4 */
                U32 const offset = (U32) (ip0 - match);
                mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
                while (((ip0 > anchor) & (match > prefixStart))
                    && (ip0[-1] == match[-1])) {
                    ip0--;
                    match--;
                    mLength++;
                } /* catch up */
                offset_2 = offset_1;
                offset_1 = offset;
                ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
                break;
            }

            /* Prepare for next iteration */
            dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
            dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
            matchIndex = hashTable[hash1];

            if (ip1 >= nextStep) {
                step++;
                nextStep += kStepIncr;
            }
            ip0 = ip1;
            ip1 = ip1 + step;
            if (ip1 > ilimit) goto _cleanup;

            curr = (U32)(ip0 - base);
            hash0 = hash1;
        }   /* end inner search loop */

        /* match found */
        assert(mLength);
        ip0 += mLength;
        anchor = ip0;

        if (ip0 <= ilimit) {
            /* Fill Table */
            assert(base+curr+2 > istart);  /* check base overflow */
            hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2;  /* here because curr+2 could be > iend-8 */
            hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

            /* check immediate repcode */
            while (ip0 <= ilimit) {
                U32 const current2 = (U32)(ip0-base);
                U32 const repIndex2 = current2 - offset_2;
                const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
                        dictBase - dictIndexDelta + repIndex2 :
                        base + repIndex2;
                if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2))
                   && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
                    const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                    size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                    U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset;   /* swap offset_2 <=> offset_1 */
                    ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                    hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
                    ip0 += repLength2;
                    anchor = ip0;
                    continue;
                }
                break;
            }
        }

        /* Prepare for next iteration */
        assert(ip0 == anchor);
        ip1 = ip0 + stepSize;
    }

_cleanup:
    /* save reps for next block */
    rep[0] = offset_1;
    rep[1] = offset_2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);
}


ZSTD_GEN_FAST_FN(dictMatchState, 4, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 5, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 6, 0)
ZSTD_GEN_FAST_FN(dictMatchState, 7, 0)

size_t ZSTD_compressBlock_fast_dictMatchState(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState != NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
    }
}


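/* extDict variant: the match window is split between an external dictionary
 * segment (dictBase) and the current prefix (base); indices below
 * prefixStartIndex resolve into the dictionary segment, and match lengths may
 * span the segment boundary (ZSTD_count_2segments). */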
static
ZSTD_ALLOW_POINTER_OVERFLOW_ATTR
size_t ZSTD_compressBlock_fast_extDict_generic(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
{
    const ZSTD_compressionParameters* const cParams = &ms->cParams;
    U32* const hashTable = ms->hashTable;
    U32 const hlog = cParams->hashLog;
    /* support stepSize of 0 */
    size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
    const BYTE* const base = ms->window.base;
    const BYTE* const dictBase = ms->window.dictBase;
    const BYTE* const istart = (const BYTE*)src;
    const BYTE* anchor = istart;
    const U32   endIndex = (U32)((size_t)(istart - base) + srcSize);
    const U32   lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
    const U32   dictStartIndex = lowLimit;
    const BYTE* const dictStart = dictBase + dictStartIndex;
    const U32   dictLimit = ms->window.dictLimit;
    const U32   prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
    const BYTE* const prefixStart = base + prefixStartIndex;
    const BYTE* const dictEnd = dictBase + prefixStartIndex;
    const BYTE* const iend = istart + srcSize;
    const BYTE* const ilimit = iend - 8;
    U32 offset_1=rep[0], offset_2=rep[1];
    U32 offsetSaved1 = 0, offsetSaved2 = 0;

    const BYTE* ip0 = istart;
    const BYTE* ip1;
    const BYTE* ip2;
    const BYTE* ip3;
    U32 current0;


    size_t hash0; /* hash for ip0 */
    size_t hash1; /* hash for ip1 */
    U32 idx; /* match idx for ip0 */
    const BYTE* idxBase; /* base pointer for idx */

    U32 offcode;
    const BYTE* match0;
    size_t mLength;
    const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */

    size_t step;
    const BYTE* nextStep;
    const size_t kStepIncr = (1 << (kSearchStrength - 1));

    (void)hasStep; /* not currently specialized on whether it's accelerated */

    DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);

    /* switch to "regular" variant if extDict is invalidated due to maxDistance */
    if (prefixStartIndex == dictStartIndex)
        return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);

    {   U32 const curr = (U32)(ip0 - base);
        U32 const maxRep = curr - dictStartIndex;
        if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0;
        if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0;
    }

    /* start each op */
_start: /* Requires: ip0 */

    step = stepSize;
    nextStep = ip0 + kStepIncr;

    /* calculate positions, ip0 - anchor == 0, so we skip step calc */
    ip1 = ip0 + 1;
    ip2 = ip0 + step;
    ip3 = ip2 + 1;

    if (ip3 >= ilimit) {
        goto _cleanup;
    }

    hash0 = ZSTD_hashPtr(ip0, hlog, mls);
    hash1 = ZSTD_hashPtr(ip1, hlog, mls);

    idx = hashTable[hash0];
    idxBase = idx < prefixStartIndex ? dictBase : base;

    do {
        {   /* load repcode match for ip[2] */
            U32 const current2 = (U32)(ip2 - base);
            U32 const repIndex = current2 - offset_1;
            const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
            U32 rval;
            if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */
               & (offset_1 > 0) ) {
                rval = MEM_read32(repBase + repIndex);
            } else {
                rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */
            }

            /* write back hash table entry */
            current0 = (U32)(ip0 - base);
            hashTable[hash0] = current0;

            /* check repcode at ip[2] */
            if (MEM_read32(ip2) == rval) {
                ip0 = ip2;
                match0 = repBase + repIndex;
                matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
                assert((match0 != prefixStart) & (match0 != dictStart));
                mLength = ip0[-1] == match0[-1];
                ip0 -= mLength;
                match0 -= mLength;
                offcode = REPCODE1_TO_OFFBASE;
                mLength += 4;
                goto _match;
        }   }

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip3;

        /* write back hash table entry */
        current0 = (U32)(ip0 - base);
        hashTable[hash0] = current0;

        {   /* load match for ip[0] */
            U32 const mval = idx >= dictStartIndex ?
                    MEM_read32(idxBase + idx) :
                    MEM_read32(ip0) ^ 1; /* guaranteed not to match */

            /* check match at ip[0] */
            if (MEM_read32(ip0) == mval) {
                /* found a match! */
                goto _offset;
        }   }

        /* lookup ip[1] */
        idx = hashTable[hash1];
        idxBase = idx < prefixStartIndex ? dictBase : base;

        /* hash ip[2] */
        hash0 = hash1;
        hash1 = ZSTD_hashPtr(ip2, hlog, mls);

        /* advance to next positions */
        ip0 = ip1;
        ip1 = ip2;
        ip2 = ip0 + step;
        ip3 = ip1 + step;

        /* calculate step */
        if (ip2 >= nextStep) {
            step++;
            PREFETCH_L1(ip1 + 64);
            PREFETCH_L1(ip1 + 128);
            nextStep += kStepIncr;
        }
    } while (ip3 < ilimit);

_cleanup:
    /* Note that there are probably still a couple positions we could search.
     * However, it seems to be a meaningful performance hit to try to search
     * them. So let's not. */

    /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),
     * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
    offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2;

    /* save reps for next block */
    rep[0] = offset_1 ? offset_1 : offsetSaved1;
    rep[1] = offset_2 ? offset_2 : offsetSaved2;

    /* Return the last literals size */
    return (size_t)(iend - anchor);

_offset: /* Requires: ip0, idx, idxBase */

    /* Compute the offset code. */
    {   U32 const offset = current0 - idx;
        const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
        matchEnd = idx < prefixStartIndex ? dictEnd : iend;
        match0 = idxBase + idx;
        offset_2 = offset_1;
        offset_1 = offset;
        offcode = OFFSET_TO_OFFBASE(offset);
        mLength = 4;

        /* Count the backwards match length. */
        while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) {
            ip0--;
            match0--;
            mLength++;
    }   }

_match: /* Requires: ip0, match0, offcode, matchEnd */

    /* Count the forward length. */
    assert(matchEnd != 0);
    mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);

    ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);

    ip0 += mLength;
    anchor = ip0;

    /* write next hash table entry */
    if (ip1 < ip0) {
        hashTable[hash1] = (U32)(ip1 - base);
    }

    /* Fill table and check for immediate repcode. */
    if (ip0 <= ilimit) {
        /* Fill Table */
        assert(base+current0+2 > istart);  /* check base overflow */
        hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2;  /* here because current+2 could be > iend-8 */
        hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);

        while (ip0 <= ilimit) {
            U32 const repIndex2 = (U32)(ip0-base) - offset_2;
            const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
            if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0))
               && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
                const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
                size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
                { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; }  /* swap offset_2 <=> offset_1 */
                ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
                hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
                ip0 += repLength2;
                anchor = ip0;
                continue;
            }
            break;
    }   }

    goto _start;
}

ZSTD_GEN_FAST_FN(extDict, 4, 0)
ZSTD_GEN_FAST_FN(extDict, 5, 0)
ZSTD_GEN_FAST_FN(extDict, 6, 0)
ZSTD_GEN_FAST_FN(extDict, 7, 0)

size_t ZSTD_compressBlock_fast_extDict(
        ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
        void const* src, size_t srcSize)
{
    U32 const mls = ms->cParams.minMatch;
    assert(ms->dictMatchState == NULL);
    switch(mls)
    {
    default: /* includes case 3 */
    case 4 :
        return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
    case 5 :
        return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
    case 6 :
        return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
    case 7 :
        return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
    }
}
