• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Function to find backward reference copies.
16 
17 #include "./backward_references.h"
18 
19 #include <algorithm>
20 #include <vector>
21 
22 #include "./command.h"
23 
24 namespace brotli {
25 
26 template<typename Hasher>
CreateBackwardReferences(size_t num_bytes,size_t position,const uint8_t * ringbuffer,const float * literal_cost,size_t ringbuffer_mask,const size_t max_backward_limit,Hasher * hasher,std::vector<Command> * commands)27 void CreateBackwardReferences(size_t num_bytes,
28                               size_t position,
29                               const uint8_t* ringbuffer,
30                               const float* literal_cost,
31                               size_t ringbuffer_mask,
32                               const size_t max_backward_limit,
33                               Hasher* hasher,
34                               std::vector<Command>* commands) {
35   // Length heuristic that seems to help probably by better selection
36   // of lazy matches of similar lengths.
37   int insert_length = 0;
38   size_t i = position & ringbuffer_mask;
39   const int i_diff = position - i;
40   const size_t i_end = i + num_bytes;
41 
42   const int random_heuristics_window_size = 512;
43   int apply_random_heuristics = i + random_heuristics_window_size;
44 
45   double average_cost = 0.0;
46   for (int k = position; k < position + num_bytes; ++k) {
47     average_cost += literal_cost[k & ringbuffer_mask];
48   }
49   average_cost /= num_bytes;
50   hasher->set_average_cost(average_cost);
51 
52   // M1 match is for considering for two repeated copies, if moving
53   // one literal form the previous copy to the current one allows the
54   // current copy to be more efficient (because the way static dictionary
55   // codes words). M1 matching improves text compression density by ~0.15 %.
56   bool match_found_M1 = false;
57   size_t best_len_M1 = 0;
58   size_t best_len_code_M1 = 0;
59   size_t best_dist_M1 = 0;
60   double best_score_M1 = 0;
61   while (i + 2 < i_end) {
62     size_t best_len = 0;
63     size_t best_len_code = 0;
64     size_t best_dist = 0;
65     double best_score = 0;
66     size_t max_distance = std::min(i + i_diff, max_backward_limit);
67     bool in_dictionary;
68     hasher->set_insert_length(insert_length);
69     bool match_found = hasher->FindLongestMatch(
70         ringbuffer, literal_cost, ringbuffer_mask,
71         i + i_diff, i_end - i, max_distance,
72         &best_len, &best_len_code, &best_dist, &best_score,
73         &in_dictionary);
74     bool best_in_dictionary = in_dictionary;
75     if (match_found) {
76       if (match_found_M1 && best_score_M1 > best_score) {
77         // Two copies after each other. Take the last literal from the
78         // last copy, and use it as the first of this one.
79         (commands->rbegin())->copy_length_ -= 1;
80         (commands->rbegin())->copy_length_code_ -= 1;
81         hasher->Store(ringbuffer + i, i + i_diff);
82         --i;
83         best_len = best_len_M1;
84         best_len_code = best_len_code_M1;
85         best_dist = best_dist_M1;
86         best_score = best_score_M1;
87         // in_dictionary doesn't need to be correct, but it is the only
88         // reason why M1 matching should be beneficial here. Setting it here
89         // will only disable further M1 matching against this copy.
90         best_in_dictionary = true;
91         in_dictionary = true;
92       } else {
93         // Found a match. Let's look for something even better ahead.
94         int delayed_backward_references_in_row = 0;
95         while (i + 4 < i_end &&
96                delayed_backward_references_in_row < 4) {
97           size_t best_len_2 = 0;
98           size_t best_len_code_2 = 0;
99           size_t best_dist_2 = 0;
100           double best_score_2 = 0;
101           max_distance = std::min(i + i_diff + 1, max_backward_limit);
102           hasher->Store(ringbuffer + i, i + i_diff);
103           match_found = hasher->FindLongestMatch(
104               ringbuffer, literal_cost, ringbuffer_mask,
105               i + i_diff + 1, i_end - i - 1, max_distance,
106               &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2,
107               &in_dictionary);
108           double cost_diff_lazy = 0;
109           if (best_len >= 4) {
110             cost_diff_lazy +=
111                 literal_cost[(i + 4) & ringbuffer_mask] - average_cost;
112           }
113           {
114             const int tail_length = best_len_2 - best_len + 1;
115             for (int k = 0; k < tail_length; ++k) {
116               cost_diff_lazy -=
117                   literal_cost[(i + best_len + k) & ringbuffer_mask] -
118                   average_cost;
119             }
120           }
121           // If we are not inserting any symbols, inserting one is more
122           // expensive than if we were inserting symbols anyways.
123           if (insert_length < 1) {
124             cost_diff_lazy += 0.97;
125           }
126           // Add bias to slightly avoid lazy matching.
127           cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
128           cost_diff_lazy += 0.04 * literal_cost[i & ringbuffer_mask];
129 
130           if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
131             // Ok, let's just write one byte for now and start a match from the
132             // next byte.
133             ++insert_length;
134             ++delayed_backward_references_in_row;
135             best_len = best_len_2;
136             best_len_code = best_len_code_2;
137             best_dist = best_dist_2;
138             best_score = best_score_2;
139             best_in_dictionary = in_dictionary;
140             i++;
141           } else {
142             break;
143           }
144         }
145       }
146       apply_random_heuristics =
147           i + 2 * best_len + random_heuristics_window_size;
148       Command cmd;
149       cmd.insert_length_ = insert_length;
150       cmd.copy_length_ = best_len;
151       cmd.copy_length_code_ = best_len_code;
152       cmd.copy_distance_ = best_dist;
153       commands->push_back(cmd);
154       insert_length = 0;
155       ++i;
156       if (best_dist <= std::min(i + i_diff, max_backward_limit)) {
157         hasher->set_last_distance(best_dist);
158       }
159 
160       // Copy all copied literals to the hasher, except the last one.
161       // We cannot store the last one yet, otherwise we couldn't find
162       // the possible M1 match.
163       for (int j = 1; j < best_len - 1; ++j) {
164         if (i + 2 < i_end) {
165           hasher->Store(ringbuffer + i, i + i_diff);
166         }
167         ++i;
168       }
169       // Prepare M1 match.
170       if (hasher->HasStaticDictionary() &&
171           best_len >= 4 && i + 20 < i_end && !best_in_dictionary) {
172         max_distance = std::min(i + i_diff, max_backward_limit);
173         match_found_M1 = hasher->FindLongestMatch(
174             ringbuffer, literal_cost, ringbuffer_mask,
175             i + i_diff, i_end - i, max_distance,
176             &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1,
177             &in_dictionary);
178       } else {
179         match_found_M1 = false;
180         in_dictionary = false;
181       }
182       // This byte is just moved from the previous copy to the current,
183       // that is no gain.
184       best_score_M1 -= literal_cost[i & ringbuffer_mask];
185       // Adjust for losing the opportunity for lazy matching.
186       best_score_M1 -= 3.75;
187 
188       // Store the last one of the match.
189       if (i + 2 < i_end) {
190         hasher->Store(ringbuffer + i, i + i_diff);
191       }
192       ++i;
193     } else {
194       match_found_M1 = false;
195       ++insert_length;
196       hasher->Store(ringbuffer + i, i + i_diff);
197       ++i;
198       // If we have not seen matches for a long time, we can skip some
199       // match lookups. Unsuccessful match lookups are very very expensive
200       // and this kind of a heuristic speeds up compression quite
201       // a lot.
202       if (i > apply_random_heuristics) {
203         // Going through uncompressible data, jump.
204         if (i > apply_random_heuristics + 4 * random_heuristics_window_size) {
205           // It is quite a long time since we saw a copy, so we assume
206           // that this data is not compressible, and store hashes less
207           // often. Hashes of non compressible data are less likely to
208           // turn out to be useful in the future, too, so we store less of
209           // them to not to flood out the hash table of good compressible
210           // data.
211           int i_jump = std::min(i + 16, i_end - 4);
212           for (; i < i_jump; i += 4) {
213             hasher->Store(ringbuffer + i, i + i_diff);
214             insert_length += 4;
215           }
216         } else {
217           int i_jump = std::min(i + 8, i_end - 2);
218           for (; i < i_jump; i += 2) {
219             hasher->Store(ringbuffer + i, i + i_diff);
220             insert_length += 2;
221           }
222         }
223       }
224     }
225   }
226   insert_length += (i_end - i);
227 
228   if (insert_length > 0) {
229     Command cmd;
230     cmd.insert_length_ = insert_length;
231     cmd.copy_length_ = 0;
232     cmd.copy_distance_ = 0;
233     commands->push_back(cmd);
234   }
235 }
236 
CreateBackwardReferences(size_t num_bytes,size_t position,const uint8_t * ringbuffer,const float * literal_cost,size_t ringbuffer_mask,const size_t max_backward_limit,Hashers * hashers,Hashers::Type hash_type,std::vector<Command> * commands)237 void CreateBackwardReferences(size_t num_bytes,
238                               size_t position,
239                               const uint8_t* ringbuffer,
240                               const float* literal_cost,
241                               size_t ringbuffer_mask,
242                               const size_t max_backward_limit,
243                               Hashers* hashers,
244                               Hashers::Type hash_type,
245                               std::vector<Command>* commands) {
246   switch (hash_type) {
247     case Hashers::HASH_15_8_4:
248       CreateBackwardReferences(
249           num_bytes, position, ringbuffer, literal_cost,
250           ringbuffer_mask, max_backward_limit,
251           hashers->hash_15_8_4.get(),
252           commands);
253       break;
254     case Hashers::HASH_15_8_2:
255       CreateBackwardReferences(
256           num_bytes, position, ringbuffer, literal_cost,
257           ringbuffer_mask, max_backward_limit,
258           hashers->hash_15_8_2.get(),
259           commands);
260       break;
261     default:
262       break;
263   }
264 }
265 
266 
267 }  // namespace brotli
268