1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include <config.h>
17 #include "blockhash.h"
18 #include <stdint.h> // uint32_t
19 #include <string.h> // memcpy, memcmp
20 #include <algorithm> // std::min
21 #include "compile_assert.h"
22 #include "logging.h"
23 #include "rolling_hash.h"
24
25 namespace open_vcdiff {
26
27 typedef unsigned long uword_t; // a machine word NOLINT
28
BlockHash(const char * source_data,size_t source_size,int starting_offset)29 BlockHash::BlockHash(const char* source_data,
30 size_t source_size,
31 int starting_offset)
32 : source_data_(source_data),
33 source_size_(source_size),
34 hash_table_mask_(0),
35 starting_offset_(starting_offset),
36 last_block_added_(-1) {
37 }
38
~BlockHash()39 BlockHash::~BlockHash() { }
40
41 // kBlockSize must be at least 2 to be meaningful. Since it's a compile-time
42 // constant, check its value at compile time rather than wasting CPU cycles
43 // on runtime checks.
44 VCD_COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);
45
46 // kBlockSize is required to be a power of 2 because multiplication
47 // (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
48 // are commonly-used operations. If kBlockSize is a compile-time
49 // constant and a power of 2, the compiler can convert these three operations
50 // into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
51 // more efficient than executing full integer multiply, divide, or remainder
52 // instructions.
53 VCD_COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
54 kBlockSize_must_be_a_power_of_2);
55
Init(bool populate_hash_table)56 bool BlockHash::Init(bool populate_hash_table) {
57 if (!hash_table_.empty() ||
58 !next_block_table_.empty() ||
59 !last_block_table_.empty()) {
60 VCD_DFATAL << "Init() called twice for same BlockHash object" << VCD_ENDL;
61 return false;
62 }
63 const size_t table_size = CalcTableSize(source_size_);
64 if (table_size == 0) {
65 VCD_DFATAL << "Error finding table size for source size " << source_size_
66 << VCD_ENDL;
67 return false;
68 }
69 // Since table_size is a power of 2, (table_size - 1) is a bit mask
70 // containing all the bits below table_size.
71 hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
72 hash_table_.resize(table_size, -1);
73 next_block_table_.resize(GetNumberOfBlocks(), -1);
74 last_block_table_.resize(GetNumberOfBlocks(), -1);
75 if (populate_hash_table) {
76 AddAllBlocks();
77 }
78 return true;
79 }
80
CreateDictionaryHash(const char * dictionary_data,size_t dictionary_size)81 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
82 size_t dictionary_size) {
83 BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
84 dictionary_size,
85 0);
86 if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
87 delete new_dictionary_hash;
88 return NULL;
89 } else {
90 return new_dictionary_hash;
91 }
92 }
93
CreateTargetHash(const char * target_data,size_t target_size,size_t dictionary_size)94 BlockHash* BlockHash::CreateTargetHash(const char* target_data,
95 size_t target_size,
96 size_t dictionary_size) {
97 BlockHash* new_target_hash = new BlockHash(target_data,
98 target_size,
99 static_cast<int>(dictionary_size));
100 if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
101 delete new_target_hash;
102 return NULL;
103 } else {
104 return new_target_hash;
105 }
106 }
107
108 // Returns zero if an error occurs.
CalcTableSize(const size_t dictionary_size)109 size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
110 // Overallocate the hash table by making it the same size (in bytes)
111 // as the source data. This is a trade-off between space and time:
112 // the empty entries in the hash table will reduce the
113 // probability of a hash collision to (sizeof(int) / kblockSize),
114 // and so save time comparing false matches.
115 const size_t min_size = (dictionary_size / sizeof(int)) + 1; // NOLINT
116 size_t table_size = 1;
117 // Find the smallest power of 2 that is >= min_size, and assign
118 // that value to table_size.
119 while (table_size < min_size) {
120 table_size <<= 1;
121 // Guard against an infinite loop
122 if (table_size <= 0) {
123 VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
124 << dictionary_size
125 << "): resulting table_size " << table_size
126 << " is zero or negative" << VCD_ENDL;
127 return 0;
128 }
129 }
130 // Check size sanity
131 if ((table_size & (table_size - 1)) != 0) {
132 VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
133 << dictionary_size
134 << "): resulting table_size " << table_size
135 << " is not a power of 2" << VCD_ENDL;
136 return 0;
137 }
138 // The loop above tries to find the smallest power of 2 that is >= min_size.
139 // That value must lie somewhere between min_size and (min_size * 2),
140 // except for the case (dictionary_size == 0, table_size == 1).
141 if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
142 VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
143 << dictionary_size
144 << "): resulting table_size " << table_size
145 << " is too large" << VCD_ENDL;
146 return 0;
147 }
148 return table_size;
149 }
150
151 // If the hash value is already available from the rolling hash,
152 // call this function to save time.
AddBlock(uint32_t hash_value)153 void BlockHash::AddBlock(uint32_t hash_value) {
154 if (hash_table_.empty()) {
155 VCD_DFATAL << "BlockHash::AddBlock() called before BlockHash::Init()"
156 << VCD_ENDL;
157 return;
158 }
159 // The initial value of last_block_added_ is -1.
160 int block_number = last_block_added_ + 1;
161 const int total_blocks =
162 static_cast<int>(source_size_ / kBlockSize); // round down
163 if (block_number >= total_blocks) {
164 VCD_DFATAL << "BlockHash::AddBlock() called"
165 " with block number " << block_number
166 << " that is past last block " << (total_blocks - 1)
167 << VCD_ENDL;
168 return;
169 }
170 if (next_block_table_[block_number] != -1) {
171 VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
172 "block number = " << block_number
173 << ", next block should be -1 but is "
174 << next_block_table_[block_number] << VCD_ENDL;
175 return;
176 }
177 const uint32_t hash_table_index = GetHashTableIndex(hash_value);
178 const int first_matching_block = hash_table_[hash_table_index];
179 if (first_matching_block < 0) {
180 // This is the first entry with this hash value
181 hash_table_[hash_table_index] = block_number;
182 last_block_table_[block_number] = block_number;
183 } else {
184 // Add this entry at the end of the chain of matching blocks
185 const int last_matching_block = last_block_table_[first_matching_block];
186 if (next_block_table_[last_matching_block] != -1) {
187 VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
188 "first matching block = " << first_matching_block
189 << ", last matching block = " << last_matching_block
190 << ", next block should be -1 but is "
191 << next_block_table_[last_matching_block] << VCD_ENDL;
192 return;
193 }
194 next_block_table_[last_matching_block] = block_number;
195 last_block_table_[first_matching_block] = block_number;
196 }
197 last_block_added_ = block_number;
198 }
199
AddAllBlocks()200 void BlockHash::AddAllBlocks() {
201 AddAllBlocksThroughIndex(static_cast<int>(source_size_));
202 }
203
AddAllBlocksThroughIndex(int end_index)204 void BlockHash::AddAllBlocksThroughIndex(int end_index) {
205 if (end_index > static_cast<int>(source_size_)) {
206 VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
207 " with index " << end_index
208 << " higher than end index " << source_size_ << VCD_ENDL;
209 return;
210 }
211 const int last_index_added = last_block_added_ * kBlockSize;
212 if (end_index <= last_index_added) {
213 VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
214 " with index " << end_index
215 << " <= last index added ( " << last_index_added
216 << ")" << VCD_ENDL;
217 return;
218 }
219 int end_limit = end_index;
220 // Don't allow reading any indices at or past source_size_.
221 // The Hash function extends (kBlockSize - 1) bytes past the index,
222 // so leave a margin of that size.
223 int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
224 if (end_limit > last_legal_hash_index) {
225 end_limit = last_legal_hash_index + 1;
226 }
227 const char* block_ptr = source_data() + NextIndexToAdd();
228 const char* const end_ptr = source_data() + end_limit;
229 while (block_ptr < end_ptr) {
230 AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
231 block_ptr += kBlockSize;
232 }
233 }
234
235 VCD_COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
236 kBlockSize_must_be_a_multiple_of_machine_word_size);
237
238 // A recursive template to compare a fixed number
239 // of (possibly unaligned) machine words starting
240 // at addresses block1 and block2. Returns true or false
241 // depending on whether an exact match was found.
242 template<int number_of_words>
CompareWholeWordValues(const char * block1,const char * block2)243 inline bool CompareWholeWordValues(const char* block1,
244 const char* block2) {
245 return CompareWholeWordValues<1>(block1, block2) &&
246 CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
247 block2 + sizeof(uword_t));
248 }
249
250 // The base of the recursive template: compare one pair of machine words.
251 template<>
CompareWholeWordValues(const char * word1,const char * word2)252 inline bool CompareWholeWordValues<1>(const char* word1,
253 const char* word2) {
254 uword_t aligned_word1, aligned_word2;
255 memcpy(&aligned_word1, word1, sizeof(aligned_word1));
256 memcpy(&aligned_word2, word2, sizeof(aligned_word2));
257 return aligned_word1 == aligned_word2;
258 }
259
260 // A block must be composed of an integral number of machine words
261 // (uword_t values.) This function takes advantage of that fact
262 // by comparing the blocks as series of (possibly unaligned) word values.
263 // A word-sized comparison can be performed as a single
264 // machine instruction. Comparing words instead of bytes means that,
265 // on a 64-bit platform, this function will use 8 times fewer test-and-branch
266 // instructions than a byte-by-byte comparison. Even with the extra
267 // cost of the calls to memcpy, this method is still at least twice as fast
268 // as memcmp (measured using gcc on a 64-bit platform, with a block size
269 // of 32.) For blocks with identical contents (a common case), this method
270 // is over six times faster than memcmp.
BlockCompareWordsInline(const char * block1,const char * block2)271 inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
272 static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
273 return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
274 }
275
BlockCompareWords(const char * block1,const char * block2)276 bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
277 return BlockCompareWordsInline(block1, block2);
278 }
279
BlockContentsMatchInline(const char * block1,const char * block2)280 inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
281 // Optimize for mismatch in first byte. Since this function is called only
282 // when the hash values of the two blocks match, it is very likely that either
283 // the blocks are identical, or else the first byte does not match.
284 if (*block1 != *block2) {
285 return false;
286 }
287 #ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
288 return BlockCompareWordsInline(block1, block2);
289 #else // !VCDIFF_USE_BLOCK_COMPARE_WORDS
290 return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
291 #endif // VCDIFF_USE_BLOCK_COMPARE_WORDS
292 }
293
BlockContentsMatch(const char * block1,const char * block2)294 bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
295 return BlockContentsMatchInline(block1, block2);
296 }
297
SkipNonMatchingBlocks(int block_number,const char * block_ptr) const298 inline int BlockHash::SkipNonMatchingBlocks(int block_number,
299 const char* block_ptr) const {
300 int probes = 0;
301 while ((block_number >= 0) &&
302 !BlockContentsMatchInline(block_ptr,
303 &source_data_[block_number * kBlockSize])) {
304 if (++probes > kMaxProbes) {
305 return -1; // Avoid too much chaining
306 }
307 block_number = next_block_table_[block_number];
308 }
309 return block_number;
310 }
311
312 // Init() must have been called and returned true before using
313 // FirstMatchingBlock or NextMatchingBlock. No check is performed
314 // for this condition; the code will crash if this condition is violated.
FirstMatchingBlockInline(uint32_t hash_value,const char * block_ptr) const315 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
316 const char* block_ptr) const {
317 return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
318 block_ptr);
319 }
320
FirstMatchingBlock(uint32_t hash_value,const char * block_ptr) const321 int BlockHash::FirstMatchingBlock(uint32_t hash_value,
322 const char* block_ptr) const {
323 return FirstMatchingBlockInline(hash_value, block_ptr);
324 }
325
NextMatchingBlock(int block_number,const char * block_ptr) const326 int BlockHash::NextMatchingBlock(int block_number,
327 const char* block_ptr) const {
328 if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
329 VCD_DFATAL << "NextMatchingBlock called for invalid block number "
330 << block_number << VCD_ENDL;
331 return -1;
332 }
333 return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
334 }
335
336 // Keep a count of the number of matches found. This will throttle the
337 // number of iterations in FindBestMatch. For example, if the entire
338 // dictionary is made up of spaces (' ') and the search string is also
339 // made up of spaces, there will be one match for each block in the
340 // dictionary.
TooManyMatches(int * match_counter)341 inline bool BlockHash::TooManyMatches(int* match_counter) {
342 ++(*match_counter);
343 return (*match_counter) > kMaxMatchesToCheck;
344 }
345
346 // Returns the number of bytes to the left of source_match_start
347 // that match the corresponding bytes to the left of target_match_start.
348 // Will not examine more than max_bytes bytes, which is to say that
349 // the return value will be in the range [0, max_bytes] inclusive.
MatchingBytesToLeft(const char * source_match_start,const char * target_match_start,int max_bytes)350 int BlockHash::MatchingBytesToLeft(const char* source_match_start,
351 const char* target_match_start,
352 int max_bytes) {
353 const char* source_ptr = source_match_start;
354 const char* target_ptr = target_match_start;
355 int bytes_found = 0;
356 while (bytes_found < max_bytes) {
357 --source_ptr;
358 --target_ptr;
359 if (*source_ptr != *target_ptr) {
360 break;
361 }
362 ++bytes_found;
363 }
364 return bytes_found;
365 }
366
367 // Returns the number of bytes starting at source_match_end
368 // that match the corresponding bytes starting at target_match_end.
369 // Will not examine more than max_bytes bytes, which is to say that
370 // the return value will be in the range [0, max_bytes] inclusive.
MatchingBytesToRight(const char * source_match_end,const char * target_match_end,int max_bytes)371 int BlockHash::MatchingBytesToRight(const char* source_match_end,
372 const char* target_match_end,
373 int max_bytes) {
374 const char* source_ptr = source_match_end;
375 const char* target_ptr = target_match_end;
376 int bytes_found = 0;
377 while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
378 ++bytes_found;
379 ++source_ptr;
380 ++target_ptr;
381 }
382 return bytes_found;
383 }
384
385 // No NULL checks are performed on the pointer arguments. The caller
386 // must guarantee that none of the arguments is NULL, or a crash will occur.
387 //
388 // The vast majority of calls to FindBestMatch enter the loop *zero* times,
389 // which is to say that most candidate blocks find no matches in the dictionary.
390 // The important sections for optimization are therefore the code outside the
391 // loop and the code within the loop conditions. Keep this to a minimum.
FindBestMatch(uint32_t hash_value,const char * target_candidate_start,const char * target_start,size_t target_size,Match * best_match) const392 void BlockHash::FindBestMatch(uint32_t hash_value,
393 const char* target_candidate_start,
394 const char* target_start,
395 size_t target_size,
396 Match* best_match) const {
397 int match_counter = 0;
398 for (int block_number = FirstMatchingBlockInline(hash_value,
399 target_candidate_start);
400 (block_number >= 0) && !TooManyMatches(&match_counter);
401 block_number = NextMatchingBlock(block_number, target_candidate_start)) {
402 int source_match_offset = block_number * kBlockSize;
403 const int source_match_end = source_match_offset + kBlockSize;
404
405 int target_match_offset =
406 static_cast<int>(target_candidate_start - target_start);
407 const int target_match_end = target_match_offset + kBlockSize;
408
409 size_t match_size = kBlockSize;
410 {
411 // Extend match start towards beginning of unencoded data
412 const int limit_bytes_to_left = std::min(source_match_offset,
413 target_match_offset);
414 const int matching_bytes_to_left =
415 MatchingBytesToLeft(source_data_ + source_match_offset,
416 target_start + target_match_offset,
417 limit_bytes_to_left);
418 source_match_offset -= matching_bytes_to_left;
419 target_match_offset -= matching_bytes_to_left;
420 match_size += matching_bytes_to_left;
421 }
422 {
423 // Extend match end towards end of unencoded data
424 const size_t source_bytes_to_right = source_size_ - source_match_end;
425 const size_t target_bytes_to_right = target_size - target_match_end;
426 const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
427 target_bytes_to_right);
428 match_size +=
429 MatchingBytesToRight(source_data_ + source_match_end,
430 target_start + target_match_end,
431 static_cast<int>(limit_bytes_to_right));
432 }
433 // Update in/out parameter if the best match found was better
434 // than any match already stored in *best_match.
435 best_match->ReplaceIfBetterMatch(match_size,
436 source_match_offset + starting_offset_,
437 target_match_offset);
438 }
439 }
440
441 } // namespace open_vcdiff
442