#include "./durchschlag.h"

#include <algorithm>
#include <cstdio> /* fprintf */
#include <exception> /* terminate */

#include "divsufsort.h"

/* Index of a position in the text. */
typedef DurchschlagTextIdx TextIdx;

/* (Sum of) value(s) of slice(s). */
typedef uint32_t Score;

typedef struct HashSlot {
  TextIdx next;
  TextIdx offset;
} HashSlot;

typedef struct MetaSlot {
  TextIdx mark;
  Score score;
} MetaSlot;

typedef struct Range {
  TextIdx start;
  TextIdx end;
} Range;

typedef struct Candidate {
  Score score;
  TextIdx position;
} Candidate;

struct greaterScore {
  bool operator()(const Candidate& a, const Candidate& b) const {
    return (a.score > b.score) ||
        ((a.score == b.score) && (a.position < b.position));
  }
};

struct lessScore {
  bool operator()(const Candidate& a, const Candidate& b) const {
    return (a.score < b.score) ||
        ((a.score == b.score) && (a.position > b.position));
  }
};

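/* Approximate upper bound on the number of candidate block positions kept by
   buildCandidatesList below; once the candidate heap grows past this size,
   the lowest-scoring candidates are evicted. */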
#define CANDIDATE_BUNDLE_SIZE (1 << 18)

static void fatal(const char* error) {
  fprintf(stderr, "%s\n", error);
  std::terminate();
}

static TextIdx calculateDictionarySize(const std::vector<Range>& ranges) {
  TextIdx result = 0;
  for (size_t i = 0; i < ranges.size(); ++i) {
    const Range& r = ranges[i];
    result += r.end - r.start;
  }
  return result;
}

static std::string createDictionary(
    const uint8_t* data, const std::vector<Range>& ranges, size_t limit) {
  std::string output;
  output.reserve(calculateDictionarySize(ranges));
  for (size_t i = 0; i < ranges.size(); ++i) {
    const Range& r = ranges[i];
    output.insert(output.end(), &data[r.start], &data[r.end]);
  }
  if (output.size() > limit) {
    output.resize(limit);
  }
  return output;
}

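/* Scan all blocks of |span| consecutive slices with a sliding window and
   collect the best block start positions in |candidates|. A block's score is
   the sum of the scores of the distinct slices it covers; |mark| is used as a
   per-slice multiplicity counter so repeated slices are counted only once.
   Approximately CANDIDATE_BUNDLE_SIZE top-scoring positions are retained.
   On return, |candidates| is a heap (lessScore order) whose top is the best
   candidate; the returned value is the score threshold a candidate had to
   meet to stay in the heap, which the caller reuses when deciding whether to
   re-insert rescored candidates. */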
/* precondition: span > 0
   precondition: end + span == len(shortcut) */
static Score buildCandidatesList(std::vector<Candidate>* candidates,
    std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
    TextIdx end) {
  candidates->resize(0);

  size_t n = map->size();
  MetaSlot* slots = map->data();
  for (size_t j = 0; j < n; ++j) {
    slots[j].mark = 0;
  }

  Score score = 0;
  /* Consider the whole span, except the last item. The following loop will
     add the last item to the end of the "chain", evaluate it, and cut one
     "link" from the beginning. */
  for (size_t j = 0; j < span - 1; ++j) {
    MetaSlot& item = slots[shortcut[j]];
    if (item.mark == 0) {
      score += item.score;
    }
    item.mark++;
  }

  TextIdx i = 0;
  TextIdx limit = std::min<TextIdx>(end, CANDIDATE_BUNDLE_SIZE);
  Score maxScore = 0;
  for (; i < limit; ++i) {
    TextIdx slice = shortcut[i + span - 1];
    MetaSlot& pick = slots[slice];
    if (pick.mark == 0) {
      score += pick.score;
    }
    pick.mark++;

    if (score > maxScore) {
      maxScore = score;
    }
    candidates->push_back({score, i});

    MetaSlot& drop = slots[shortcut[i]];
    drop.mark--;
    if (drop.mark == 0) {
      score -= drop.score;
    }
  }

  std::make_heap(candidates->begin(), candidates->end(), greaterScore());
  Score minScore = candidates->at(0).score;
  for (; i < end; ++i) {
    TextIdx slice = shortcut[i + span - 1];
    MetaSlot& pick = slots[slice];
    if (pick.mark == 0) {
      score += pick.score;
    }
    pick.mark++;

    if (score > maxScore) {
      maxScore = score;
    }
    if (score >= minScore) {
      candidates->push_back({score, i});
      std::push_heap(candidates->begin(), candidates->end(), greaterScore());
      if (candidates->size() > CANDIDATE_BUNDLE_SIZE && maxScore != minScore) {
        while (candidates->at(0).score == minScore) {
          std::pop_heap(candidates->begin(), candidates->end(), greaterScore());
          candidates->pop_back();
        }
        minScore = candidates->at(0).score;
      }
    }

    MetaSlot& drop = slots[shortcut[i]];
    drop.mark--;
    if (drop.mark == 0) {
      score -= drop.score;
    }
  }

  for (size_t j = 0; j < n; ++j) {
    slots[j].mark = 0;
  }

  std::make_heap(candidates->begin(), candidates->end(), lessScore());
  return minScore;
}

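/* Variant of buildCandidatesList used by the "exclusive" strategy: instead of
   a bounded heap, every block position is kept. |candidates| acts as an array
   of buckets indexed by score, and |next| threads the positions of each bucket
   into a singly-linked list, so candidates->at(s) is the head of the list of
   blocks whose score was s when the list was (re)built. Returns the highest
   score observed. */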
/* precondition: span > 0
   precondition: end + span == len(shortcut) */
static Score rebuildCandidatesList(std::vector<TextIdx>* candidates,
    std::vector<MetaSlot>* map, TextIdx span, const TextIdx* shortcut,
    TextIdx end, TextIdx* next) {
  size_t n = candidates->size();
  TextIdx* data = candidates->data();
  for (size_t i = 0; i < n; ++i) {
    data[i] = 0;
  }

  n = map->size();
  MetaSlot* slots = map->data();
  for (size_t i = 0; i < n; ++i) {
    slots[i].mark = 0;
  }

  Score score = 0;
  /* Consider the whole span, except the last item. The following loop will
     add the last item to the end of the "chain", evaluate it, and cut one
     "link" from the beginning. */
  for (TextIdx i = 0; i < span - 1; ++i) {
    MetaSlot& item = slots[shortcut[i]];
    if (item.mark == 0) {
      score += item.score;
    }
    item.mark++;
  }

  Score maxScore = 0;
  for (TextIdx i = 0; i < end; ++i) {
    MetaSlot& pick = slots[shortcut[i + span - 1]];
    if (pick.mark == 0) {
      score += pick.score;
    }
    pick.mark++;

    if (candidates->size() <= score) {
      candidates->resize(score + 1);
    }
    if (score > maxScore) {
      maxScore = score;
    }
    next[i] = candidates->at(score);
    candidates->at(score) = i;

    MetaSlot& drop = slots[shortcut[i]];
    drop.mark--;
    if (drop.mark == 0) {
      score -= drop.score;
    }
  }

  for (size_t i = 0; i < n; ++i) {
    slots[i].mark = 0;
  }

  candidates->resize(maxScore + 1);
  return maxScore;
}

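/* Insert [start, end) into |ranges|, which is kept sorted and free of
   overlaps; the new range is merged with any existing ranges it touches. */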
static void addRange(std::vector<Range>* ranges, TextIdx start, TextIdx end) {
  for (auto it = ranges->begin(); it != ranges->end();) {
    if (end < it->start) {
      ranges->insert(it, {start, end});
      return;
    }
    if (it->end < start) {
      it++;
      continue;
    }
    // Combine with the existing range.
    start = std::min(start, it->start);
    end = std::max(end, it->end);
    // Remove the consumed range and continue.
    it = ranges->erase(it);
  }
  ranges->push_back({start, end});
}

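/* Convenience entry point: prepare the context with the hash-based slicer and
   run the collaborative strategy. Illustrative usage (a sketch; the sizes and
   variable names below are hypothetical):

     std::vector<size_t> sizes = ...;  // byte length of each sample
     const uint8_t* corpus = ...;      // samples concatenated back-to-back
     std::string dictionary =
         durchschlag_generate(110 << 10, 16, 1024, sizes, corpus);
*/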
std::string durchschlag_generate(
    size_t dictionary_size_limit, size_t slice_len, size_t block_len,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
  DurchschlagContext ctx = durchschlag_prepare(
      slice_len, sample_sizes, sample_data);
  return durchschlag_generate(DURCHSCHLAG_COLLABORATIVE,
      dictionary_size_limit, block_len, ctx, sample_data);
}

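/* Build a DurchschlagContext directly from the sample data. Every position i
   in [0, total - slice_len] is assigned an id in sliceMap such that two
   positions share an id exactly when they start the same slice_len-byte
   substring. Deduplication uses a rolling hash over slice_len bytes plus a
   chained hash table; id 0 is reserved as the chain terminator, so real slice
   ids start at 1. */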
DurchschlagContext durchschlag_prepare(size_t slice_len,
    const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
  /* Parameters aliasing */
  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
  if (sliceLen != slice_len) fatal("slice_len is too large");
  if (sliceLen < 1) fatal("slice_len is too small");
  const uint8_t* data = sample_data;

  TextIdx total = 0;
  std::vector<TextIdx> offsets;
  offsets.reserve(sample_sizes.size());
  for (size_t i = 0; i < sample_sizes.size(); ++i) {
    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
    if (delta != sample_sizes[i]) fatal("sample is too large");
    if (delta == 0) fatal("0-length samples are prohibited");
    TextIdx next_total = total + delta;
    if (next_total <= total) fatal("corpus is too large");
    total = next_total;
    offsets.push_back(total);
  }

  if (total < sliceLen) fatal("slice_len is larger than corpus size");
  TextIdx end = total - static_cast<TextIdx>(sliceLen) + 1;
  TextIdx hashLen = 11;
  while (hashLen < 29 && ((1u << hashLen) < end)) {
    hashLen += 3;
  }
  hashLen -= 3;
  TextIdx hashMask = (1u << hashLen) - 1u;
  std::vector<TextIdx> hashHead(1 << hashLen);
  TextIdx hash = 0;
  TextIdx lShift = 3;
  TextIdx rShift = hashLen - lShift;
  for (TextIdx i = 0; i < sliceLen - 1; ++i) {
    TextIdx v = data[i];
    hash = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
  }
  TextIdx lShiftX = (lShift * (sliceLen - 1)) % hashLen;
  TextIdx rShiftX = hashLen - lShiftX;

  std::vector<HashSlot> map;
  map.push_back({0, 0});
  TextIdx hashSlot = 1;
  std::vector<TextIdx> sliceMap;
  sliceMap.reserve(end);
  for (TextIdx i = 0; i < end; ++i) {
    TextIdx v = data[i + sliceLen - 1];
    TextIdx bucket = (((hash << lShift) | (hash >> rShift)) & hashMask) ^ v;
    v = data[i];
    hash = bucket ^ (((v << lShiftX) | (v >> rShiftX)) & hashMask);
    TextIdx slot = hashHead[bucket];
    while (slot != 0) {
      HashSlot& item = map[slot];
      TextIdx start = item.offset;
      bool miss = false;
      for (TextIdx j = 0; j < sliceLen; ++j) {
        if (data[i + j] != data[start + j]) {
          miss = true;
          break;
        }
      }
      if (!miss) {
        sliceMap.push_back(slot);
        break;
      }
      slot = item.next;
    }
    if (slot == 0) {
      map.push_back({hashHead[bucket], i});
      hashHead[bucket] = hashSlot;
      sliceMap.push_back(hashSlot);
      hashSlot++;
    }
  }

  return {total, sliceLen, static_cast<TextIdx>(map.size()),
      std::move(offsets), std::move(sliceMap)};
}

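/* Build a DurchschlagContext from a precomputed suffix array / LCP index
   (see durchschlag_index below). Suffixes that are adjacent in suffix-array
   order and share at least slice_len leading bytes receive the same slice id;
   ids are then renumbered in text order to improve locality of later passes. */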
DurchschlagContext durchschlag_prepare(size_t slice_len,
    const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index) {
  /* Parameters aliasing */
  TextIdx sliceLen = static_cast<TextIdx>(slice_len);
  if (sliceLen != slice_len) fatal("slice_len is too large");
  if (sliceLen < 1) fatal("slice_len is too small");
  const TextIdx* lcp = index.lcp.data();
  const TextIdx* sa = index.sa.data();

  TextIdx total = 0;
  std::vector<TextIdx> offsets;
  offsets.reserve(sample_sizes.size());
  for (size_t i = 0; i < sample_sizes.size(); ++i) {
    TextIdx delta = static_cast<TextIdx>(sample_sizes[i]);
    if (delta != sample_sizes[i]) fatal("sample is too large");
    if (delta == 0) fatal("0-length samples are prohibited");
    TextIdx next_total = total + delta;
    if (next_total <= total) fatal("corpus is too large");
    total = next_total;
    offsets.push_back(total);
  }

  if (total < sliceLen) fatal("slice_len is larger than corpus size");
  TextIdx counter = 1;
  TextIdx end = total - sliceLen + 1;
  std::vector<TextIdx> sliceMap(total);
  TextIdx last = 0;
  TextIdx current = 1;
  while (current <= total) {
    if (lcp[current - 1] < sliceLen) {
      for (TextIdx i = last; i < current; ++i) {
        sliceMap[sa[i]] = counter;
      }
      counter++;
      last = current;
    }
    current++;
  }
  sliceMap.resize(end);

  // Reorder items for better locality.
  std::vector<TextIdx> reorder(counter);
  counter = 1;
  for (TextIdx i = 0; i < end; ++i) {
    if (reorder[sliceMap[i]] == 0) {
      reorder[sliceMap[i]] = counter++;
    }
  }
  for (TextIdx i = 0; i < end; ++i) {
    sliceMap[i] = reorder[sliceMap[i]];
  }

  return {total, sliceLen, counter, std::move(offsets), std::move(sliceMap)};
}

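/* Compute the suffix array of |data| with divsufsort and derive the LCP array
   from it; the LCP construction below appears to be the usual
   inverse-suffix-array ("Kasai") linear-time algorithm. lcp[i] is the length
   of the common prefix of the suffixes at ranks i and i + 1. */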
DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data) {
  TextIdx total = static_cast<TextIdx>(data.size());
  if (total != data.size()) fatal("corpus is too large");
  saidx_t saTotal = static_cast<saidx_t>(total);
  if (saTotal < 0) fatal("corpus is too large");
  if (static_cast<TextIdx>(saTotal) != total) fatal("corpus is too large");
  std::vector<TextIdx> sa(total);
  /* Hopefully, non-negative int32_t values match TextIdx ones. */
  if (sizeof(TextIdx) != sizeof(int32_t)) fatal("type length mismatch");
  int32_t* saData = reinterpret_cast<int32_t*>(sa.data());
  divsufsort(data.data(), saData, saTotal);

  std::vector<TextIdx> isa(total);
  for (TextIdx i = 0; i < total; ++i) isa[sa[i]] = i;

  // TODO: borrowed -> unknown efficiency.
  std::vector<TextIdx> lcp(total);
  TextIdx k = 0;
  lcp[total - 1] = 0;
  for (TextIdx i = 0; i < total; ++i) {
    TextIdx current = isa[i];
    if (current == total - 1) {
      k = 0;
      continue;
    }
    TextIdx j = sa[current + 1];  // Suffix that follows the i-th suffix in sorted order.
    while ((i + k < total) && (j + k < total) && (data[i + k] == data[j + k])) {
      ++k;
    }
    lcp[current] = k;
    if (k > 0) --k;
  }

  return {std::move(lcp), std::move(sa)};
}

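/* For every distinct slice, count in how many samples it occurs: |offsets|
   holds the cumulative end offset of each sample, and |mark| is bumped at
   each sample boundary so a slice is counted at most once per sample. The
   result is accumulated in map[...].score. */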
static void ScoreSlices(const std::vector<TextIdx>& offsets,
    std::vector<MetaSlot>& map, const TextIdx* shortcut, TextIdx end) {
  TextIdx piece = 0;
  /* Fresh map contains all zeroes -> initial mark should be different. */
  TextIdx mark = 1;
  for (TextIdx i = 0; i < end; ++i) {
    if (offsets[piece] == i) {
      piece++;
      mark++;
    }
    MetaSlot& item = map[shortcut[i]];
    if (item.mark != mark) {
      item.mark = mark;
      item.score++;
    }
  }
}

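/* Greedy block selection using the bucket lists from rebuildCandidatesList.
   Scores are updated lazily: the top candidate is re-scored when popped, and
   if its score has dropped it is pushed back into the matching bucket. When a
   block is accepted, the scores of the slices it covers are zeroed, so later
   blocks get no credit for content that is already in the dictionary. After
   too many lazy updates (triesLimit / candidatesLimit) the bucket lists are
   rebuilt from scratch. */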
static std::string durchschlagGenerateExclusive(
    size_t dictionary_size_limit, size_t block_len,
    const DurchschlagContext& context, const uint8_t* sample_data) {
  /* Parameters aliasing */
  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
  if (targetSize != dictionary_size_limit) {
    fprintf(stderr, "dictionary_size_limit is too large\n");
    return "";
  }
  TextIdx sliceLen = context.sliceLen;
  TextIdx total = context.dataSize;
  TextIdx blockLen = static_cast<TextIdx>(block_len);
  if (blockLen != block_len) {
    fprintf(stderr, "block_len is too large\n");
    return "";
  }
  const uint8_t* data = sample_data;
  const std::vector<TextIdx>& offsets = context.offsets;
  std::vector<MetaSlot> map(context.numUniqueSlices);
  const TextIdx* shortcut = context.sliceMap.data();

  /* Initialization */
  if (blockLen < sliceLen) {
    fprintf(stderr, "sliceLen is larger than block_len\n");
    return "";
  }
  if (targetSize < blockLen || total < blockLen) {
    fprintf(stderr, "block_len is too large\n");
    return "";
  }
  TextIdx end = total - sliceLen + 1;
  ScoreSlices(offsets, map, shortcut, end);
  TextIdx span = blockLen - sliceLen + 1;
  end = static_cast<TextIdx>(context.sliceMap.size()) - span;
  std::vector<TextIdx> candidates;
  std::vector<TextIdx> next(end);
  Score maxScore = rebuildCandidatesList(
      &candidates, &map, span, shortcut, end, next.data());

  /* Block selection */
  const size_t triesLimit = (600 * 1000000) / span;
  const size_t candidatesLimit = (150 * 1000000) / span;
  std::vector<Range> ranges;
  TextIdx mark = 0;
  size_t numTries = 0;
  while (true) {
    TextIdx dictSize = calculateDictionarySize(ranges);
    size_t numCandidates = 0;
    if (dictSize > targetSize - blockLen) {
      break;
    }
    if (maxScore == 0) {
      break;
    }
    while (true) {
      TextIdx candidate = 0;
      while (maxScore > 0) {
        if (candidates[maxScore] != 0) {
          candidate = candidates[maxScore];
          candidates[maxScore] = next[candidate];
          break;
        }
        maxScore--;
      }
      if (maxScore == 0) {
        break;
      }
      mark++;
      numTries++;
      numCandidates++;
      Score score = 0;
      for (size_t j = candidate; j < candidate + span; ++j) {
        MetaSlot& item = map[shortcut[j]];
        if (item.mark != mark) {
          score += item.score;
          item.mark = mark;
        }
      }
      if (score < maxScore) {
        if (numTries < triesLimit && numCandidates < candidatesLimit) {
          next[candidate] = candidates[score];
          candidates[score] = candidate;
        } else {
          maxScore = rebuildCandidatesList(
              &candidates, &map, span, shortcut, end, next.data());
          mark = 0;
          numTries = 0;
          numCandidates = 0;
        }
        continue;
      } else if (score > maxScore) {
        fprintf(stderr, "Broken invariant\n");
        return "";
      }
      for (TextIdx j = candidate; j < candidate + span; ++j) {
        MetaSlot& item = map[shortcut[j]];
        item.score = 0;
      }
      addRange(&ranges, candidate, candidate + blockLen);
      break;
    }
  }

  return createDictionary(data, ranges, targetSize);
}

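/* Greedy block selection using the bounded candidate heap from
   buildCandidatesList. The best candidate is popped and re-scored; if its
   score has dropped but is still at least minScore it is pushed back,
   otherwise it is discarded. Accepted blocks zero the scores of the slices
   they cover; when the heap runs dry it is rebuilt. */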
static std::string durchschlagGenerateCollaborative(
    size_t dictionary_size_limit, size_t block_len,
    const DurchschlagContext& context, const uint8_t* sample_data) {
  /* Parameters aliasing */
  TextIdx targetSize = static_cast<TextIdx>(dictionary_size_limit);
  if (targetSize != dictionary_size_limit) {
    fprintf(stderr, "dictionary_size_limit is too large\n");
    return "";
  }
  TextIdx sliceLen = context.sliceLen;
  TextIdx total = context.dataSize;
  TextIdx blockLen = static_cast<TextIdx>(block_len);
  if (blockLen != block_len) {
    fprintf(stderr, "block_len is too large\n");
    return "";
  }
  const uint8_t* data = sample_data;
  const std::vector<TextIdx>& offsets = context.offsets;
  std::vector<MetaSlot> map(context.numUniqueSlices);
  const TextIdx* shortcut = context.sliceMap.data();

  /* Initialization */
  if (blockLen < sliceLen) {
    fprintf(stderr, "sliceLen is larger than block_len\n");
    return "";
  }
  if (targetSize < blockLen || total < blockLen) {
    fprintf(stderr, "block_len is too large\n");
    return "";
  }
  TextIdx end = total - sliceLen + 1;
  ScoreSlices(offsets, map, shortcut, end);
  TextIdx span = blockLen - sliceLen + 1;
  end = static_cast<TextIdx>(context.sliceMap.size()) - span;
  std::vector<Candidate> candidates;
  candidates.reserve(CANDIDATE_BUNDLE_SIZE + 1024);
  Score minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);

  /* Block selection */
  std::vector<Range> ranges;
  TextIdx mark = 0;
  while (true) {
    TextIdx dictSize = calculateDictionarySize(ranges);
    if (dictSize > targetSize - blockLen) {
      break;
    }
    if (minScore == 0 && candidates.empty()) {
      break;
    }
    while (true) {
      if (candidates.empty()) {
        minScore = buildCandidatesList(&candidates, &map, span, shortcut, end);
        mark = 0;
      }
      TextIdx candidate = candidates[0].position;
      Score expectedScore = candidates[0].score;
      if (expectedScore == 0) {
        candidates.resize(0);
        break;
      }
      std::pop_heap(candidates.begin(), candidates.end(), lessScore());
      candidates.pop_back();
      mark++;
      Score score = 0;
      for (TextIdx j = candidate; j < candidate + span; ++j) {
        MetaSlot& item = map[shortcut[j]];
        if (item.mark != mark) {
          score += item.score;
          item.mark = mark;
        }
      }
      if (score < expectedScore) {
        if (score >= minScore) {
          candidates.push_back({score, candidate});
          std::push_heap(candidates.begin(), candidates.end(), lessScore());
        }
        continue;
      } else if (score > expectedScore) {
        fatal("Broken invariant");
      }
      for (TextIdx j = candidate; j < candidate + span; ++j) {
        MetaSlot& item = map[shortcut[j]];
        item.score = 0;
      }
      addRange(&ranges, candidate, candidate + blockLen);
      break;
    }
  }

  return createDictionary(data, ranges, targetSize);
}

std::string durchschlag_generate(DurchschalgResourceStrategy strategy,
    size_t dictionary_size_limit, size_t block_len,
    const DurchschlagContext& context, const uint8_t* sample_data) {
  if (strategy == DURCHSCHLAG_COLLABORATIVE) {
    return durchschlagGenerateCollaborative(
        dictionary_size_limit, block_len, context, sample_data);
  } else {
    return durchschlagGenerateExclusive(
        dictionary_size_limit, block_len, context, sample_data);
  }
}

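/* Rewrite the samples in place, dropping bytes that are not covered by any
   slice occurring in at least |minimum_population| distinct samples; the
   entries of |sample_sizes| are updated to the new, shorter lengths. */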
void durchschlag_distill(size_t slice_len, size_t minimum_population,
    std::vector<size_t>* sample_sizes, uint8_t* sample_data) {
  /* Parameters aliasing */
  uint8_t* data = sample_data;

  /* Build slice map. */
  DurchschlagContext context = durchschlag_prepare(
      slice_len, *sample_sizes, data);

  /* Calculate slice population. */
  const std::vector<TextIdx>& offsets = context.offsets;
  std::vector<MetaSlot> map(context.numUniqueSlices);
  const TextIdx* shortcut = context.sliceMap.data();
  TextIdx sliceLen = context.sliceLen;
  TextIdx total = context.dataSize;
  TextIdx end = total - sliceLen + 1;
  ScoreSlices(offsets, map, shortcut, end);

  /* Condense samples, omitting unique slices. */
  TextIdx readPos = 0;
  TextIdx writePos = 0;
  TextIdx lastNonUniquePos = 0;
  for (TextIdx i = 0; i < sample_sizes->size(); ++i) {
    TextIdx sampleStart = writePos;
    TextIdx oldSampleEnd =
        readPos + static_cast<TextIdx>(sample_sizes->at(i));
    while (readPos < oldSampleEnd) {
      if (readPos < end) {
        MetaSlot& item = map[shortcut[readPos]];
        if (item.score >= minimum_population) {
          lastNonUniquePos = readPos + sliceLen;
        }
      }
      if (readPos < lastNonUniquePos) {
        data[writePos++] = data[readPos];
      }
      readPos++;
    }
    sample_sizes->at(i) = writePos - sampleStart;
  }
}

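/* Same population criterion as durchschlag_distill, but instead of removing
   bytes, bytes not covered by any sufficiently popular slice are overwritten
   with zeroes, so the sample sizes stay unchanged. */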
void durchschlag_purify(size_t slice_len, size_t minimum_population,
    const std::vector<size_t>& sample_sizes, uint8_t* sample_data) {
  /* Parameters aliasing */
  uint8_t* data = sample_data;

  /* Build slice map. */
  DurchschlagContext context = durchschlag_prepare(
      slice_len, sample_sizes, data);

  /* Calculate slice population. */
  const std::vector<TextIdx>& offsets = context.offsets;
  std::vector<MetaSlot> map(context.numUniqueSlices);
  const TextIdx* shortcut = context.sliceMap.data();
  TextIdx sliceLen = context.sliceLen;
  TextIdx total = context.dataSize;
  TextIdx end = total - sliceLen + 1;
  ScoreSlices(offsets, map, shortcut, end);

  /* Rewrite samples, zeroing out unique slices. */
  TextIdx lastNonUniquePos = 0;
  for (TextIdx readPos = 0; readPos < total; ++readPos) {
    if (readPos < end) {
      MetaSlot& item = map[shortcut[readPos]];
      if (item.score >= minimum_population) {
        lastNonUniquePos = readPos + sliceLen;
      }
    }
    if (readPos >= lastNonUniquePos) {
      data[readPos] = 0;
    }
  }
}