1 #ifndef BROTLI_RESEARCH_DURCHSCHLAG_H_ 2 #define BROTLI_RESEARCH_DURCHSCHLAG_H_ 3 4 #include <cstddef> 5 #include <cstdint> 6 #include <string> 7 #include <vector> 8 9 /** 10 * Generate a dictionary for given samples. 11 * 12 * @param dictionary_size_limit maximal dictionary size 13 * @param slice_len text slice size 14 * @param block_len score block length 15 * @param sample_sizes vector with sample sizes 16 * @param sample_data concatenated samples 17 * @return generated dictionary 18 */ 19 std::string durchschlag_generate( 20 size_t dictionary_size_limit, size_t slice_len, size_t block_len, 21 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data); 22 23 //------------------------------------------------------------------------------ 24 // Lower level API for repetitive dictionary generation. 25 //------------------------------------------------------------------------------ 26 27 /* Pointer to position in text. */ 28 typedef uint32_t DurchschlagTextIdx; 29 30 /* Context is made public for flexible serialization / deserialization. */ 31 typedef struct DurchschlagContext { 32 DurchschlagTextIdx dataSize; 33 DurchschlagTextIdx sliceLen; 34 DurchschlagTextIdx numUniqueSlices; 35 std::vector<DurchschlagTextIdx> offsets; 36 std::vector<DurchschlagTextIdx> sliceMap; 37 } DurchschlagContext; 38 39 DurchschlagContext durchschlag_prepare(size_t slice_len, 40 const std::vector<size_t>& sample_sizes, const uint8_t* sample_data); 41 42 typedef enum DurchschalgResourceStrategy { 43 // Faster 44 DURCHSCHLAG_EXCLUSIVE = 0, 45 // Uses much less memory 46 DURCHSCHLAG_COLLABORATIVE = 1 47 } DurchschalgResourceStrategy; 48 49 std::string durchschlag_generate(DurchschalgResourceStrategy strategy, 50 size_t dictionary_size_limit, size_t block_len, 51 const DurchschlagContext& context, const uint8_t* sample_data); 52 53 //------------------------------------------------------------------------------ 54 // Suffix Array based preparation. 55 //------------------------------------------------------------------------------ 56 57 typedef struct DurchschlagIndex { 58 std::vector<DurchschlagTextIdx> lcp; 59 std::vector<DurchschlagTextIdx> sa; 60 } DurchschlagIndex; 61 62 DurchschlagIndex durchschlag_index(const std::vector<uint8_t>& data); 63 64 DurchschlagContext durchschlag_prepare(size_t slice_len, 65 const std::vector<size_t>& sample_sizes, const DurchschlagIndex& index); 66 67 //------------------------------------------------------------------------------ 68 // Data preparation. 69 //------------------------------------------------------------------------------ 70 71 /** 72 * Cut out unique slices. 73 * 74 * Both @p sample_sizes and @p sample_data are modified in-place. Number of 75 * samples remains unchanged, but some samples become shorter. 76 * 77 * @param slice_len (unique) slice size 78 * @param minimum_population minimum non-unique slice occurrence 79 * @param sample_sizes [in / out] vector with sample sizes 80 * @param sample_data [in / out] concatenated samples 81 */ 82 void durchschlag_distill(size_t slice_len, size_t minimum_population, 83 std::vector<size_t>* sample_sizes, uint8_t* sample_data); 84 85 /** 86 * Replace unique slices with zeroes. 87 * 88 * @p sample_data is modified in-place. Number of samples and their length 89 * remain unchanged. 90 * 91 * @param slice_len (unique) slice size 92 * @param minimum_population minimum non-unique slice occurrence 93 * @param sample_sizes vector with sample sizes 94 * @param sample_data [in / out] concatenated samples 95 */ 96 void durchschlag_purify(size_t slice_len, size_t minimum_population, 97 const std::vector<size_t>& sample_sizes, uint8_t* sample_data); 98 99 #endif // BROTLI_RESEARCH_DURCHSCHLAG_H_ 100