• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include <config.h>
17 #include "blockhash.h"
18 #include "compile_assert.h"
19 #include <stdint.h>  // uint32_t
20 #include <string.h>  // memcpy, memcmp
21 #include "logging.h"
22 #include "rolling_hash.h"
23 
24 namespace open_vcdiff {
25 
26 typedef unsigned long uword_t;  // a machine word                         NOLINT
27 
BlockHash(const char * source_data,size_t source_size,int starting_offset)28 BlockHash::BlockHash(const char* source_data,
29                      size_t source_size,
30                      int starting_offset)
31     : source_data_(source_data),
32       source_size_(source_size),
33       hash_table_mask_(0),
34       starting_offset_(starting_offset),
35       last_block_added_(-1) {
36 }
37 
~BlockHash()38 BlockHash::~BlockHash() { }
39 
40 // kBlockSize must be at least 2 to be meaningful.  Since it's a compile-time
41 // constant, check its value at compile time rather than wasting CPU cycles
42 // on runtime checks.
43 COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);
44 
45 // kBlockSize is required to be a power of 2 because multiplication
46 // (n * kBlockSize), division (n / kBlockSize) and MOD (n % kBlockSize)
47 // are commonly-used operations.  If kBlockSize is a compile-time
48 // constant and a power of 2, the compiler can convert these three operations
49 // into bit-shift (>> or <<) and bitwise-AND (&) operations, which are much
50 // more efficient than executing full integer multiply, divide, or remainder
51 // instructions.
52 COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
53                kBlockSize_must_be_a_power_of_2);
54 
Init(bool populate_hash_table)55 bool BlockHash::Init(bool populate_hash_table) {
56   if (!hash_table_.empty() ||
57       !next_block_table_.empty() ||
58       !last_block_table_.empty()) {
59     LOG(DFATAL) << "Init() called twice for same BlockHash object" << LOG_ENDL;
60     return false;
61   }
62   const size_t table_size = CalcTableSize(source_size_);
63   if (table_size == 0) {
64     LOG(DFATAL) << "Error finding table size for source size " << source_size_
65                 << LOG_ENDL;
66     return false;
67   }
68   // Since table_size is a power of 2, (table_size - 1) is a bit mask
69   // containing all the bits below table_size.
70   hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
71   hash_table_.resize(table_size, -1);
72   next_block_table_.resize(GetNumberOfBlocks(), -1);
73   last_block_table_.resize(GetNumberOfBlocks(), -1);
74   if (populate_hash_table) {
75     AddAllBlocks();
76   }
77   return true;
78 }
79 
CreateDictionaryHash(const char * dictionary_data,size_t dictionary_size)80 const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
81                                                  size_t dictionary_size) {
82   BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
83                                                  dictionary_size,
84                                                  0);
85   if (!new_dictionary_hash->Init(/* populate_hash_table = */ true)) {
86     delete new_dictionary_hash;
87     return NULL;
88   } else {
89     return new_dictionary_hash;
90   }
91 }
92 
CreateTargetHash(const char * target_data,size_t target_size,size_t dictionary_size)93 BlockHash* BlockHash::CreateTargetHash(const char* target_data,
94                                        size_t target_size,
95                                        size_t dictionary_size) {
96   BlockHash* new_target_hash = new BlockHash(target_data,
97                                              target_size,
98                                              static_cast<int>(dictionary_size));
99   if (!new_target_hash->Init(/* populate_hash_table = */ false)) {
100     delete new_target_hash;
101     return NULL;
102   } else {
103     return new_target_hash;
104   }
105 }
106 
107 // Returns zero if an error occurs.
CalcTableSize(const size_t dictionary_size)108 size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
109   // Overallocate the hash table by making it the same size (in bytes)
110   // as the source data.  This is a trade-off between space and time:
111   // the empty entries in the hash table will reduce the
112   // probability of a hash collision to (sizeof(int) / kblockSize),
113   // and so save time comparing false matches.
114   const size_t min_size = (dictionary_size / sizeof(int)) + 1;  // NOLINT
115   size_t table_size = 1;
116   // Find the smallest power of 2 that is >= min_size, and assign
117   // that value to table_size.
118   while (table_size < min_size) {
119     table_size <<= 1;
120     // Guard against an infinite loop
121     if (table_size <= 0) {
122       LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
123                   << dictionary_size
124                   << "): resulting table_size " << table_size
125                   << " is zero or negative" << LOG_ENDL;
126       return 0;
127     }
128   }
129   // Check size sanity
130   if ((table_size & (table_size - 1)) != 0) {
131     LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
132                 << dictionary_size
133                 << "): resulting table_size " << table_size
134                 << " is not a power of 2" << LOG_ENDL;
135     return 0;
136   }
137   // The loop above tries to find the smallest power of 2 that is >= min_size.
138   // That value must lie somewhere between min_size and (min_size * 2),
139   // except for the case (dictionary_size == 0, table_size == 1).
140   if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
141     LOG(DFATAL) << "Internal error: CalcTableSize(dictionary_size = "
142                 << dictionary_size
143                 << "): resulting table_size " << table_size
144                 << " is too large" << LOG_ENDL;
145     return 0;
146   }
147   return table_size;
148 }
149 
150 // If the hash value is already available from the rolling hash,
151 // call this function to save time.
AddBlock(uint32_t hash_value)152 void BlockHash::AddBlock(uint32_t hash_value) {
153   if (hash_table_.empty()) {
154     LOG(DFATAL) << "BlockHash::AddBlock() called before BlockHash::Init()"
155                 << LOG_ENDL;
156     return;
157   }
158   // The initial value of last_block_added_ is -1.
159   int block_number = last_block_added_ + 1;
160   const int total_blocks =
161       static_cast<int>(source_size_ / kBlockSize);  // round down
162   if (block_number >= total_blocks) {
163     LOG(DFATAL) << "BlockHash::AddBlock() called"
164                    " with block number " << block_number
165                 << " that is past last block " << (total_blocks - 1)
166                 << LOG_ENDL;
167     return;
168   }
169   if (next_block_table_[block_number] != -1) {
170     LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
171                    "block number = " << block_number
172                 << ", next block should be -1 but is "
173                 << next_block_table_[block_number] << LOG_ENDL;
174     return;
175   }
176   const uint32_t hash_table_index = GetHashTableIndex(hash_value);
177   const int first_matching_block = hash_table_[hash_table_index];
178   if (first_matching_block < 0) {
179     // This is the first entry with this hash value
180     hash_table_[hash_table_index] = block_number;
181     last_block_table_[block_number] = block_number;
182   } else {
183     // Add this entry at the end of the chain of matching blocks
184     const int last_matching_block = last_block_table_[first_matching_block];
185     if (next_block_table_[last_matching_block] != -1) {
186       LOG(DFATAL) << "Internal error in BlockHash::AddBlock(): "
187                      "first matching block = " << first_matching_block
188                   << ", last matching block = " << last_matching_block
189                   << ", next block should be -1 but is "
190                   << next_block_table_[last_matching_block] << LOG_ENDL;
191       return;
192     }
193     next_block_table_[last_matching_block] = block_number;
194     last_block_table_[first_matching_block] = block_number;
195   }
196   last_block_added_ = block_number;
197 }
198 
AddAllBlocks()199 void BlockHash::AddAllBlocks() {
200   AddAllBlocksThroughIndex(static_cast<int>(source_size_));
201 }
202 
AddAllBlocksThroughIndex(int end_index)203 void BlockHash::AddAllBlocksThroughIndex(int end_index) {
204   if (end_index > static_cast<int>(source_size_)) {
205     LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
206                    " with index " << end_index
207                 << " higher than end index  " << source_size_ << LOG_ENDL;
208     return;
209   }
210   const int last_index_added = last_block_added_ * kBlockSize;
211   if (end_index <= last_index_added) {
212     LOG(DFATAL) << "BlockHash::AddAllBlocksThroughIndex() called"
213                    " with index " << end_index
214                 << " <= last index added ( " << last_index_added
215                 << ")" << LOG_ENDL;
216     return;
217   }
218   int end_limit = end_index;
219   // Don't allow reading any indices at or past source_size_.
220   // The Hash function extends (kBlockSize - 1) bytes past the index,
221   // so leave a margin of that size.
222   int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
223   if (end_limit > last_legal_hash_index) {
224     end_limit = last_legal_hash_index + 1;
225   }
226   const char* block_ptr = source_data() + NextIndexToAdd();
227   const char* const end_ptr = source_data() + end_limit;
228   while (block_ptr < end_ptr) {
229     AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
230     block_ptr += kBlockSize;
231   }
232 }
233 
234 COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
235                kBlockSize_must_be_a_multiple_of_machine_word_size);
236 
237 // A recursive template to compare a fixed number
238 // of (possibly unaligned) machine words starting
239 // at addresses block1 and block2.  Returns true or false
240 // depending on whether an exact match was found.
241 template<int number_of_words>
CompareWholeWordValues(const char * block1,const char * block2)242 inline bool CompareWholeWordValues(const char* block1,
243                                    const char* block2) {
244   return CompareWholeWordValues<1>(block1, block2) &&
245          CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
246                                                      block2 + sizeof(uword_t));
247 }
248 
249 // The base of the recursive template: compare one pair of machine words.
250 template<>
CompareWholeWordValues(const char * word1,const char * word2)251 inline bool CompareWholeWordValues<1>(const char* word1,
252                                       const char* word2) {
253   uword_t aligned_word1, aligned_word2;
254   memcpy(&aligned_word1, word1, sizeof(aligned_word1));
255   memcpy(&aligned_word2, word2, sizeof(aligned_word2));
256   return aligned_word1 == aligned_word2;
257 }
258 
259 // A block must be composed of an integral number of machine words
260 // (uword_t values.)  This function takes advantage of that fact
261 // by comparing the blocks as series of (possibly unaligned) word values.
262 // A word-sized comparison can be performed as a single
263 // machine instruction.  Comparing words instead of bytes means that,
264 // on a 64-bit platform, this function will use 8 times fewer test-and-branch
265 // instructions than a byte-by-byte comparison.  Even with the extra
266 // cost of the calls to memcpy, this method is still at least twice as fast
267 // as memcmp (measured using gcc on a 64-bit platform, with a block size
268 // of 32.)  For blocks with identical contents (a common case), this method
269 // is over six times faster than memcmp.
BlockCompareWordsInline(const char * block1,const char * block2)270 inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
271   static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
272   return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
273 }
274 
BlockCompareWords(const char * block1,const char * block2)275 bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
276   return BlockCompareWordsInline(block1, block2);
277 }
278 
BlockContentsMatchInline(const char * block1,const char * block2)279 inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
280   // Optimize for mismatch in first byte.  Since this function is called only
281   // when the hash values of the two blocks match, it is very likely that either
282   // the blocks are identical, or else the first byte does not match.
283   if (*block1 != *block2) {
284     return false;
285   }
286 #ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
287   return BlockCompareWordsInline(block1, block2);
288 #else  // !VCDIFF_USE_BLOCK_COMPARE_WORDS
289   return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
290 #endif  // VCDIFF_USE_BLOCK_COMPARE_WORDS
291 }
292 
BlockContentsMatch(const char * block1,const char * block2)293 bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
294   return BlockContentsMatchInline(block1, block2);
295 }
296 
SkipNonMatchingBlocks(int block_number,const char * block_ptr) const297 inline int BlockHash::SkipNonMatchingBlocks(int block_number,
298                                             const char* block_ptr) const {
299   int probes = 0;
300   while ((block_number >= 0) &&
301          !BlockContentsMatchInline(block_ptr,
302                                    &source_data_[block_number * kBlockSize])) {
303     if (++probes > kMaxProbes) {
304       return -1;  // Avoid too much chaining
305     }
306     block_number = next_block_table_[block_number];
307   }
308   return block_number;
309 }
310 
311 // Init() must have been called and returned true before using
312 // FirstMatchingBlock or NextMatchingBlock.  No check is performed
313 // for this condition; the code will crash if this condition is violated.
FirstMatchingBlockInline(uint32_t hash_value,const char * block_ptr) const314 inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
315                                                const char* block_ptr) const {
316   return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
317                                block_ptr);
318 }
319 
FirstMatchingBlock(uint32_t hash_value,const char * block_ptr) const320 int BlockHash::FirstMatchingBlock(uint32_t hash_value,
321                                   const char* block_ptr) const {
322   return FirstMatchingBlockInline(hash_value, block_ptr);
323 }
324 
NextMatchingBlock(int block_number,const char * block_ptr) const325 int BlockHash::NextMatchingBlock(int block_number,
326                                  const char* block_ptr) const {
327   if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
328     LOG(DFATAL) << "NextMatchingBlock called for invalid block number "
329                 << block_number << LOG_ENDL;
330     return -1;
331   }
332   return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
333 }
334 
335 // Keep a count of the number of matches found.  This will throttle the
336 // number of iterations in FindBestMatch.  For example, if the entire
337 // dictionary is made up of spaces (' ') and the search string is also
338 // made up of spaces, there will be one match for each block in the
339 // dictionary.
TooManyMatches(int * match_counter)340 inline bool BlockHash::TooManyMatches(int* match_counter) {
341   ++(*match_counter);
342   return (*match_counter) > kMaxMatchesToCheck;
343 }
344 
345 // Returns the number of bytes to the left of source_match_start
346 // that match the corresponding bytes to the left of target_match_start.
347 // Will not examine more than max_bytes bytes, which is to say that
348 // the return value will be in the range [0, max_bytes] inclusive.
MatchingBytesToLeft(const char * source_match_start,const char * target_match_start,int max_bytes)349 int BlockHash::MatchingBytesToLeft(const char* source_match_start,
350                                    const char* target_match_start,
351                                    int max_bytes) {
352   const char* source_ptr = source_match_start;
353   const char* target_ptr = target_match_start;
354   int bytes_found = 0;
355   while (bytes_found < max_bytes) {
356     --source_ptr;
357     --target_ptr;
358     if (*source_ptr != *target_ptr) {
359       break;
360     }
361     ++bytes_found;
362   }
363   return bytes_found;
364 }
365 
366 // Returns the number of bytes starting at source_match_end
367 // that match the corresponding bytes starting at target_match_end.
368 // Will not examine more than max_bytes bytes, which is to say that
369 // the return value will be in the range [0, max_bytes] inclusive.
MatchingBytesToRight(const char * source_match_end,const char * target_match_end,int max_bytes)370 int BlockHash::MatchingBytesToRight(const char* source_match_end,
371                                     const char* target_match_end,
372                                     int max_bytes) {
373   const char* source_ptr = source_match_end;
374   const char* target_ptr = target_match_end;
375   int bytes_found = 0;
376   while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
377     ++bytes_found;
378     ++source_ptr;
379     ++target_ptr;
380   }
381   return bytes_found;
382 }
383 
384 // No NULL checks are performed on the pointer arguments.  The caller
385 // must guarantee that none of the arguments is NULL, or a crash will occur.
386 //
387 // The vast majority of calls to FindBestMatch enter the loop *zero* times,
388 // which is to say that most candidate blocks find no matches in the dictionary.
389 // The important sections for optimization are therefore the code outside the
390 // loop and the code within the loop conditions.  Keep this to a minimum.
FindBestMatch(uint32_t hash_value,const char * target_candidate_start,const char * target_start,size_t target_size,Match * best_match) const391 void BlockHash::FindBestMatch(uint32_t hash_value,
392                               const char* target_candidate_start,
393                               const char* target_start,
394                               size_t target_size,
395                               Match* best_match) const {
396   int match_counter = 0;
397   for (int block_number = FirstMatchingBlockInline(hash_value,
398                                                    target_candidate_start);
399        (block_number >= 0) && !TooManyMatches(&match_counter);
400        block_number = NextMatchingBlock(block_number, target_candidate_start)) {
401     int source_match_offset = block_number * kBlockSize;
402     const int source_match_end = source_match_offset + kBlockSize;
403 
404     int target_match_offset =
405         static_cast<int>(target_candidate_start - target_start);
406     const int target_match_end = target_match_offset + kBlockSize;
407 
408     size_t match_size = kBlockSize;
409     {
410       // Extend match start towards beginning of unencoded data
411       const int limit_bytes_to_left = std::min(source_match_offset,
412                                                target_match_offset);
413       const int matching_bytes_to_left =
414           MatchingBytesToLeft(source_data_ + source_match_offset,
415                               target_start + target_match_offset,
416                               limit_bytes_to_left);
417       source_match_offset -= matching_bytes_to_left;
418       target_match_offset -= matching_bytes_to_left;
419       match_size += matching_bytes_to_left;
420     }
421     {
422       // Extend match end towards end of unencoded data
423       const size_t source_bytes_to_right = source_size_ - source_match_end;
424       const size_t target_bytes_to_right = target_size - target_match_end;
425       const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
426                                                    target_bytes_to_right);
427       match_size +=
428           MatchingBytesToRight(source_data_ + source_match_end,
429                                target_start + target_match_end,
430                                static_cast<int>(limit_bytes_to_right));
431     }
432     // Update in/out parameter if the best match found was better
433     // than any match already stored in *best_match.
434     best_match->ReplaceIfBetterMatch(match_size,
435                                      source_match_offset + starting_offset_,
436                                      target_match_offset);
437   }
438 }
439 
440 }  // namespace open_vcdiff
441