1 #include "./deorummolae.h"
2
3 #include <array>
4 #include <vector>
5
6 #include "./esaxx/sais.hxx"
7
/* Used for quick SA-entry to file mapping. Each file is padded to a size that
is a multiple of the chunk size. */
10 #define CHUNK_SIZE 64
11 /* Length of substring that is considered to be covered by dictionary string. */
12 #define CUT_MATCH 6
13 /* Minimal dictionary entry size. */
14 #define MIN_MATCH 24
15
16 /* Non tunable definitions. */
17 #define CHUNK_MASK (CHUNK_SIZE - 1)
18 #define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
19
20 /* File coverage: every bit set to 1 denotes a file covered by an isle. */
21 typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
22
/* Number of bits set in a 64-bit word. */
static int popcount(uint64_t u) {
  int bits = __builtin_popcountll(u);
  return bits;
}
26
27 /* Condense terminators and pad file entries. */
rewriteText(std::vector<int> * text)28 static void rewriteText(std::vector<int>* text) {
29 int terminator = text->back();
30 int prev = terminator;
31 size_t to = 0;
32 for (size_t from = 0; from < text->size(); ++from) {
33 int next = text->at(from);
34 if (next < 256 || prev < 256) {
35 text->at(to++) = next;
36 if (next >= 256) terminator = next;
37 }
38 prev = next;
39 }
40 text->resize(to);
41 if (text->empty()) text->push_back(terminator);
42 while (text->size() & CHUNK_MASK) text->push_back(terminator);
43 }
44
45 /* Reenumerate terminators for smaller alphabet. */
static void remapTerminators(std::vector<int>* text, int* next_terminator) {
  int fresh = 256;  /* Next unused terminator id. */
  int last = -1;    /* Previously emitted symbol. */
  for (size_t pos = 0; pos < text->size(); ++pos) {
    int symbol = text->at(pos);
    if (symbol >= 256) {
      /* Terminator: assign a fresh id right after a char; inside a run of
         terminators reuse the previous id so the run collapses to one id. */
      symbol = (last < 256) ? fresh++ : last;
      text->at(pos) = symbol;
    }
    last = symbol;
  }
  *next_terminator = fresh;
}
63
64 /* Combine all file entries; create mapping position->file. */
buildFullText(std::vector<std::vector<int>> * data,std::vector<int> * full_text,std::vector<size_t> * file_map,std::vector<size_t> * file_offset,int * next_terminator)65 static void buildFullText(std::vector<std::vector<int>>* data,
66 std::vector<int>* full_text, std::vector<size_t>* file_map,
67 std::vector<size_t>* file_offset, int* next_terminator) {
68 file_map->resize(0);
69 file_offset->resize(0);
70 full_text->resize(0);
71 for (size_t i = 0; i < data->size(); ++i) {
72 file_offset->push_back(full_text->size());
73 std::vector<int>& file = data->at(i);
74 rewriteText(&file);
75 full_text->insert(full_text->end(), file.begin(), file.end());
76 file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
77 }
78 if (false) remapTerminators(full_text, next_terminator);
79 }
80
/* Build longest-common-prefix array based on suffix array and text.
   TODO: implementation is borrowed; its efficiency has not been assessed. */
static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
    std::vector<int>* lcp, std::vector<int>* inverse_sa) {
  const int n = static_cast<int>(text->size());
  lcp->resize(n);
  lcp->at(n - 1) = 0;
  int match = 0;  /* Carried-over match length (Kasai's trick). */
  for (int pos = 0; pos < n; ++pos) {
    int rank = inverse_sa->at(pos);
    if (rank == n - 1) {
      /* Last suffix in SA order has no successor to compare with. */
      match = 0;
      continue;
    }
    int other = sa->at(rank + 1);  /* Suffix that follows in SA order. */
    while (pos + match < n && other + match < n &&
           text->at(pos + match) == text->at(other + match)) {
      match++;
    }
    lcp->at(rank) = match;
    /* Next text position shares at least (match - 1) symbols. */
    if (match > 0) match--;
  }
}
102
/* Isle is a range in SA with LCP not less than some value.
   When we raise the LCP requirement, the isle sinks and smaller isles appear
   instead. */
typedef struct {
  int lcp;  /* Minimal LCP shared by all SA entries inside the isle. */
  int l;    /* Leftmost SA index of the isle. */
  int r;    /* SA index where the isle closed; written but not read here. */
  Coverage coverage;  /* One bit per file containing the isle's substring. */
} Isle;
112
113 /* Helper routine for `cutMatch`. */
poisonData(int pos,int length,std::vector<std::vector<int>> * data,std::vector<size_t> * file_map,std::vector<size_t> * file_offset,int * next_terminator)114 static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
115 std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
116 int* next_terminator) {
117 size_t f = file_map->at(pos / CHUNK_SIZE);
118 pos -= file_offset->at(f);
119 std::vector<int>& file = data->at(f);
120 int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
121 for (int j = 0; j < l; j++, pos++) {
122 if (file[pos] >= 256) continue;
123 if (file[pos + 1] >= 256) {
124 file[pos] = file[pos + 1];
125 } else if (pos > 0 && file[pos - 1] >= 256) {
126 file[pos] = file[pos - 1];
127 } else {
128 file[pos] = (*next_terminator)++;
129 }
130 }
131 }
132
133 /* Remove substrings of a given match from files.
134 Substrings are replaced with unique terminators, so next iteration SA would
135 not allow to cross removed areas. */
static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
    std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa,
    int* next_terminator, std::vector<size_t>* file_map,
    std::vector<size_t>* file_offset) {
  /* Work the match length down to CUT_MATCH; each pass poisons all SA
     entries that still share a prefix of at least `length`. */
  while (length >= CUT_MATCH) {
    int i = index;
    /* Walk right over SA entries whose LCP with the next entry is >= length
       and poison each of them. */
    while (lcp->at(i) >= length) {
      i++;
      poisonData(
          sa->at(i), length, data, file_map, file_offset, next_terminator);
    }
    /* Walk left, poisoning the seed entry itself and its left neighbors. */
    while (true) {
      poisonData(
          sa->at(index), length, data, file_map, file_offset, next_terminator);
      if (index == 0 || lcp->at(index - 1) < length) break;
      index--;
    }
    length--;
    /* Re-seed at the suffix starting one position after the current one. */
    index = invese_sa->at(sa->at(index) + 1);
  }
}
157
DM_generate(uint8_t * dictionary,size_t dictionary_size_limit,size_t num_samples,const size_t * sample_sizes,const uint8_t * sample_data)158 size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
159 size_t num_samples, const size_t* sample_sizes,
160 const uint8_t* sample_data) {
161 {
162 uint64_t tmp = 0;
163 if (popcount(tmp - 1u) != 64) {
164 fprintf(stderr, "64-bit platform is required\n");
165 return 0;
166 }
167 }
168
169 /* Could use 256 + '0' for easier debugging. */
170 int next_terminator = 256;
171
172 std::vector<std::vector<int>> data;
173
174 size_t offset = 0;
175 if (num_samples > MAX_FILES) num_samples = MAX_FILES;
176 for (size_t n = 0; n < num_samples; ++n) {
177 size_t next_offset = offset + sample_sizes[n];
178 data.push_back(
179 std::vector<int>(sample_data + offset, sample_data + next_offset));
180 offset = next_offset;
181 data.back().push_back(next_terminator++);
182 }
183
184 /* Most arrays are allocated once, and then just resized to smaller and
185 smaller sizes. */
186 std::vector<int> full_text;
187 std::vector<size_t> file_map;
188 std::vector<size_t> file_offset;
189 std::vector<int> sa;
190 std::vector<int> invese_sa;
191 std::vector<int> lcp;
192 std::vector<Isle> isles;
193 std::vector<char> output_data;
194 size_t total = 0;
195 size_t total_cost = 0;
196 size_t best_cost;
197 Isle best_isle;
198 int min_count = num_samples;
199
200 while (true) {
201 size_t max_match = dictionary_size_limit - total;
202 buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
203 sa.resize(full_text.size());
204 saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
205 next_terminator);
206 invese_sa.resize(full_text.size());
207 for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i;
208 buildLcp(&full_text, &sa, &lcp, &invese_sa);
209
210 /* Do not rebuild SA/LCP, just use different selection. */
211 retry:
212 best_cost = 0;
213 best_isle = {0, 0, 0, {{0}}};
214 isles.resize(0);
215 isles.push_back(best_isle);
216
217 for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
218 int l = i;
219 Coverage cov = {{0}};
220 int f = file_map[sa[i] / CHUNK_SIZE];
221 cov[f >> 6] = ((uint64_t)1) << (f & 63);
222 while (lcp[i] < isles.back().lcp) {
223 Isle& top = isles.back();
224 top.r = i;
225 l = top.l;
226 for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
227 int count = 0;
228 for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
229 int effective_lcp = top.lcp;
230 /* Restrict (last) dictionary entry length. */
231 if (effective_lcp > max_match) effective_lcp = max_match;
232 int cost = count * effective_lcp;
233 if (cost > best_cost && count >= min_count &&
234 effective_lcp >= MIN_MATCH) {
235 best_cost = cost;
236 best_isle = top;
237 best_isle.lcp = effective_lcp;
238 }
239 isles.pop_back();
240 for (size_t x = 0; x < cov.size(); ++x) {
241 isles.back().coverage[x] |= cov[x];
242 }
243 }
244 if (lcp[i] > isles.back().lcp) isles.push_back({lcp[i], l, 0, {{0}}});
245 for (size_t x = 0; x < cov.size(); ++x) {
246 isles.back().coverage[x] |= cov[x];
247 }
248 }
249
250 /* When saturated matches do not match length restrictions, lower the
251 saturation requirements. */
252 if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
253 if (min_count >= 8) {
254 min_count = (min_count * 7) / 8;
255 fprintf(stderr, "Retry: min_count=%d\n", min_count);
256 goto retry;
257 }
258 break;
259 }
260
261 /* Save the entry. */
262 fprintf(stderr,
263 "Savings: %zu+%zu, dictionary: %zu+%d\n",
264 total_cost, best_cost, total, best_isle.lcp);
265 for (size_t i = 0; i < best_isle.lcp; ++i) {
266 dictionary[total + i] =
267 static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
268 }
269 total += best_isle.lcp;
270 total_cost += best_cost;
271 cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
272 &invese_sa, &next_terminator, &file_map, &file_offset);
273 if (total >= dictionary_size_limit) break;
274 }
275 return total;
276 }
277