1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Function to find backward reference copies.
16
17 #include "./backward_references.h"
18
19 #include <algorithm>
20 #include <vector>
21
22 #include "./command.h"
23
24 namespace brotli {
25
26 template<typename Hasher>
CreateBackwardReferences(size_t num_bytes,size_t position,const uint8_t * ringbuffer,const float * literal_cost,size_t ringbuffer_mask,const size_t max_backward_limit,Hasher * hasher,std::vector<Command> * commands)27 void CreateBackwardReferences(size_t num_bytes,
28 size_t position,
29 const uint8_t* ringbuffer,
30 const float* literal_cost,
31 size_t ringbuffer_mask,
32 const size_t max_backward_limit,
33 Hasher* hasher,
34 std::vector<Command>* commands) {
35 // Length heuristic that seems to help probably by better selection
36 // of lazy matches of similar lengths.
37 int insert_length = 0;
38 size_t i = position & ringbuffer_mask;
39 const int i_diff = position - i;
40 const size_t i_end = i + num_bytes;
41
42 const int random_heuristics_window_size = 512;
43 int apply_random_heuristics = i + random_heuristics_window_size;
44
45 double average_cost = 0.0;
46 for (int k = position; k < position + num_bytes; ++k) {
47 average_cost += literal_cost[k & ringbuffer_mask];
48 }
49 average_cost /= num_bytes;
50 hasher->set_average_cost(average_cost);
51
52 // M1 match is for considering for two repeated copies, if moving
53 // one literal form the previous copy to the current one allows the
54 // current copy to be more efficient (because the way static dictionary
55 // codes words). M1 matching improves text compression density by ~0.15 %.
56 bool match_found_M1 = false;
57 size_t best_len_M1 = 0;
58 size_t best_len_code_M1 = 0;
59 size_t best_dist_M1 = 0;
60 double best_score_M1 = 0;
61 while (i + 2 < i_end) {
62 size_t best_len = 0;
63 size_t best_len_code = 0;
64 size_t best_dist = 0;
65 double best_score = 0;
66 size_t max_distance = std::min(i + i_diff, max_backward_limit);
67 bool in_dictionary;
68 hasher->set_insert_length(insert_length);
69 bool match_found = hasher->FindLongestMatch(
70 ringbuffer, literal_cost, ringbuffer_mask,
71 i + i_diff, i_end - i, max_distance,
72 &best_len, &best_len_code, &best_dist, &best_score,
73 &in_dictionary);
74 bool best_in_dictionary = in_dictionary;
75 if (match_found) {
76 if (match_found_M1 && best_score_M1 > best_score) {
77 // Two copies after each other. Take the last literal from the
78 // last copy, and use it as the first of this one.
79 (commands->rbegin())->copy_length_ -= 1;
80 (commands->rbegin())->copy_length_code_ -= 1;
81 hasher->Store(ringbuffer + i, i + i_diff);
82 --i;
83 best_len = best_len_M1;
84 best_len_code = best_len_code_M1;
85 best_dist = best_dist_M1;
86 best_score = best_score_M1;
87 // in_dictionary doesn't need to be correct, but it is the only
88 // reason why M1 matching should be beneficial here. Setting it here
89 // will only disable further M1 matching against this copy.
90 best_in_dictionary = true;
91 in_dictionary = true;
92 } else {
93 // Found a match. Let's look for something even better ahead.
94 int delayed_backward_references_in_row = 0;
95 while (i + 4 < i_end &&
96 delayed_backward_references_in_row < 4) {
97 size_t best_len_2 = 0;
98 size_t best_len_code_2 = 0;
99 size_t best_dist_2 = 0;
100 double best_score_2 = 0;
101 max_distance = std::min(i + i_diff + 1, max_backward_limit);
102 hasher->Store(ringbuffer + i, i + i_diff);
103 match_found = hasher->FindLongestMatch(
104 ringbuffer, literal_cost, ringbuffer_mask,
105 i + i_diff + 1, i_end - i - 1, max_distance,
106 &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2,
107 &in_dictionary);
108 double cost_diff_lazy = 0;
109 if (best_len >= 4) {
110 cost_diff_lazy +=
111 literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
112 }
113 {
114 const int tail_length = best_len_2 - best_len + 1;
115 for (int k = 0; k < tail_length; ++k) {
116 cost_diff_lazy -=
117 literal_cost[(i + best_len + k) & ringbuffer_mask] -
118 average_cost;
119 }
120 }
121 // If we are not inserting any symbols, inserting one is more
122 // expensive than if we were inserting symbols anyways.
123 if (insert_length < 1) {
124 cost_diff_lazy += 0.97;
125 }
126 // Add bias to slightly avoid lazy matching.
127 cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
128 cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
129
130 if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
131 // Ok, let's just write one byte for now and start a match from the
132 // next byte.
133 ++insert_length;
134 ++delayed_backward_references_in_row;
135 best_len = best_len_2;
136 best_len_code = best_len_code_2;
137 best_dist = best_dist_2;
138 best_score = best_score_2;
139 best_in_dictionary = in_dictionary;
140 i++;
141 } else {
142 break;
143 }
144 }
145 }
146 apply_random_heuristics =
147 i + 2 * best_len + random_heuristics_window_size;
148 Command cmd;
149 cmd.insert_length_ = insert_length;
150 cmd.copy_length_ = best_len;
151 cmd.copy_length_code_ = best_len_code;
152 cmd.copy_distance_ = best_dist;
153 commands->push_back(cmd);
154 insert_length = 0;
155 ++i;
156 if (best_dist <= std::min(i + i_diff, max_backward_limit)) {
157 hasher->set_last_distance(best_dist);
158 }
159
160 // Copy all copied literals to the hasher, except the last one.
161 // We cannot store the last one yet, otherwise we couldn't find
162 // the possible M1 match.
163 for (int j = 1; j < best_len - 1; ++j) {
164 if (i + 2 < i_end) {
165 hasher->Store(ringbuffer + i, i + i_diff);
166 }
167 ++i;
168 }
169 // Prepare M1 match.
170 if (hasher->HasStaticDictionary() &&
171 best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
172 max_distance = std::min(i + i_diff, max_backward_limit);
173 match_found_M1 = hasher->FindLongestMatch(
174 ringbuffer, literal_cost, ringbuffer_mask,
175 i + i_diff, i_end - i, max_distance,
176 &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1,
177 &in_dictionary);
178 } else {
179 match_found_M1 = false;
180 in_dictionary = false;
181 }
182 // This byte is just moved from the previous copy to the current,
183 // that is no gain.
184 best_score_M1 -= literal_cost[i & ringbuffer_mask];
185 // Adjust for losing the opportunity for lazy matching.
186 best_score_M1 -= 3.75;
187
188 // Store the last one of the match.
189 if (i + 2 < i_end) {
190 hasher->Store(ringbuffer + i, i + i_diff);
191 }
192 ++i;
193 } else {
194 match_found_M1 = false;
195 ++insert_length;
196 hasher->Store(ringbuffer + i, i + i_diff);
197 ++i;
198 // If we have not seen matches for a long time, we can skip some
199 // match lookups. Unsuccessful match lookups are very very expensive
200 // and this kind of a heuristic speeds up compression quite
201 // a lot.
202 if (i > apply_random_heuristics) {
203 // Going through uncompressible data, jump.
204 if (i > apply_random_heuristics + 4 * random_heuristics_window_size) {
205 // It is quite a long time since we saw a copy, so we assume
206 // that this data is not compressible, and store hashes less
207 // often. Hashes of non compressible data are less likely to
208 // turn out to be useful in the future, too, so we store less of
209 // them to not to flood out the hash table of good compressible
210 // data.
211 int i_jump = std::min(i + 16, i_end - 4);
212 for (; i < i_jump; i += 4) {
213 hasher->Store(ringbuffer + i, i + i_diff);
214 insert_length += 4;
215 }
216 } else {
217 int i_jump = std::min(i + 8, i_end - 2);
218 for (; i < i_jump; i += 2) {
219 hasher->Store(ringbuffer + i, i + i_diff);
220 insert_length += 2;
221 }
222 }
223 }
224 }
225 }
226 insert_length += (i_end - i);
227
228 if (insert_length > 0) {
229 Command cmd;
230 cmd.insert_length_ = insert_length;
231 cmd.copy_length_ = 0;
232 cmd.copy_distance_ = 0;
233 commands->push_back(cmd);
234 }
235 }
236
CreateBackwardReferences(size_t num_bytes,size_t position,const uint8_t * ringbuffer,const float * literal_cost,size_t ringbuffer_mask,const size_t max_backward_limit,Hashers * hashers,Hashers::Type hash_type,std::vector<Command> * commands)237 void CreateBackwardReferences(size_t num_bytes,
238 size_t position,
239 const uint8_t* ringbuffer,
240 const float* literal_cost,
241 size_t ringbuffer_mask,
242 const size_t max_backward_limit,
243 Hashers* hashers,
244 Hashers::Type hash_type,
245 std::vector<Command>* commands) {
246 switch (hash_type) {
247 case Hashers::HASH_15_8_4:
248 CreateBackwardReferences(
249 num_bytes, position, ringbuffer, literal_cost,
250 ringbuffer_mask, max_backward_limit,
251 hashers->hash_15_8_4.get(),
252 commands);
253 break;
254 case Hashers::HASH_15_8_2:
255 CreateBackwardReferences(
256 num_bytes, position, ringbuffer, literal_cost,
257 ringbuffer_mask, max_backward_limit,
258 hashers->hash_15_8_2.get(),
259 commands);
260 break;
261 default:
262 break;
263 }
264 }
265
266
267 } // namespace brotli
268