1 // Copyright 2006 Google Inc. 2 // Authors: Sanjay Ghemawat, Jeff Dean, Chandra Chereddi, Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 // Implementation of the Bentley/McIlroy algorithm for finding differences. 17 // Bentley, McIlroy. DCC 1999. Data Compression Using Long Common Strings. 18 // http://citeseer.ist.psu.edu/555557.html 19 20 #ifndef OPEN_VCDIFF_BLOCKHASH_H_ 21 #define OPEN_VCDIFF_BLOCKHASH_H_ 22 23 #include <config.h> 24 #include <stddef.h> // size_t 25 #include <stdint.h> // uint32_t 26 #include <vector> 27 28 namespace open_vcdiff { 29 30 // A generic hash table which will be used to keep track of byte runs 31 // of size kBlockSize in both the incrementally processed target data 32 // and the preprocessed source dictionary. 33 // 34 // A custom hash table implementation is used instead of the standard 35 // hash_map template because we know that there will be exactly one 36 // entry in the BlockHash corresponding to each kBlockSize bytes 37 // in the source data, which makes certain optimizations possible: 38 // * The memory for the hash table and for all hash entries can be allocated 39 // in one step rather than incrementally for each insert operation. 40 // * A single integer can be used to represent both 41 // the index of the next hash entry in the chain 42 // and the position of the entry within the source data 43 // (== kBlockSize * block_number). This greatly reduces the size 44 // of a hash entry. 45 // 46 class BlockHash { 47 public: 48 // Block size as per Bentley/McIlroy; must be a power of two. 49 // 50 // Using (for example) kBlockSize = 4 guarantees that no match smaller 51 // than size 4 will be identified, that some matches having sizes 52 // 4, 5, or 6 may be identified, and that all matches 53 // having size 7 or greater will be identified (because any string of 54 // 7 bytes must contain a complete aligned block of 4 bytes.) 55 // 56 // Increasing kBlockSize by a factor of two will halve the amount of 57 // memory needed for the next block table, and will halve the setup time 58 // for a new BlockHash. However, it also doubles the minimum 59 // match length that is guaranteed to be found in FindBestMatch(), 60 // so that function will be less effective in finding matches. 61 // 62 // Computational effort in FindBestMatch (which is the inner loop of 63 // the encoding algorithm) will be proportional to the number of 64 // matches found, and a low value of kBlockSize will waste time 65 // tracking down small matches. On the other hand, if this value 66 // is set too high, no matches will be found at all. 67 // 68 // It is suggested that different values of kBlockSize be tried against 69 // a representative data set to find the best tradeoff between 70 // memory/CPU and the effectiveness of FindBestMatch(). 71 // 72 // If you change kBlockSize to a smaller value, please increase 73 // kMaxMatchesToCheck accordingly. 74 static const int kBlockSize = 16; 75 76 // This class is used to store the best match found by FindBestMatch() 77 // and return it to the caller. 78 class Match { 79 public: Match()80 Match() : size_(0), source_offset_(-1), target_offset_(-1) { } 81 ReplaceIfBetterMatch(size_t candidate_size,int candidate_source_offset,int candidate_target_offset)82 void ReplaceIfBetterMatch(size_t candidate_size, 83 int candidate_source_offset, 84 int candidate_target_offset) { 85 if (candidate_size > size_) { 86 size_ = candidate_size; 87 source_offset_ = candidate_source_offset; 88 target_offset_ = candidate_target_offset; 89 } 90 } 91 size()92 size_t size() const { return size_; } source_offset()93 int source_offset() const { return source_offset_; } target_offset()94 int target_offset() const { return target_offset_; } 95 96 private: 97 // The size of the best (longest) match passed to ReplaceIfBetterMatch(). 98 size_t size_; 99 100 // The source offset of the match, including the starting_offset_ 101 // of the BlockHash for which the match was found. 102 int source_offset_; 103 104 // The target offset of the match. An offset of 0 corresponds to the 105 // data at target_start, which is an argument of FindBestMatch(). 106 int target_offset_; 107 108 // Making these private avoids implicit copy constructor 109 // & assignment operator 110 Match(const Match&); // NOLINT 111 void operator=(const Match&); 112 }; 113 114 // A BlockHash is created using a buffer of source data. The hash table 115 // will contain one entry for each kBlockSize-byte block in the 116 // source data. 117 // 118 // See the comments for starting_offset_, below, for a description of 119 // the starting_offset argument. For a hash of source (dictionary) data, 120 // starting_offset_ will be zero; for a hash of previously encoded 121 // target data, starting_offset_ will be equal to the dictionary size. 122 // 123 BlockHash(const char* source_data, size_t source_size, int starting_offset); 124 125 ~BlockHash(); 126 127 // Initializes the object before use. 128 // This method must be called after constructing a BlockHash object, 129 // and before any other method may be called. This is because 130 // Init() dynamically allocates hash_table_ and next_block_table_. 131 // Returns true if initialization succeeded, or false if an error occurred, 132 // in which case no other method except the destructor may then be used 133 // on the object. 134 // 135 // If populate_hash_table is true, then AddAllBlocks() will be called 136 // to populate the hash table. If populate_hash_table is false, then 137 // classes that inherit from BlockHash are expected to call AddBlock() 138 // to incrementally populate individual blocks of data. 139 // 140 bool Init(bool populate_hash_table); 141 142 // In the context of the open-vcdiff encoder, BlockHash is used for two 143 // purposes: to hash the source (dictionary) data, and to hash 144 // the previously encoded target data. The main differences between 145 // a dictionary BlockHash and a target BlockHash are as follows: 146 // 147 // 1. The best_match->source_offset() returned from FindBestMatch() 148 // for a target BlockHash is computed in the following manner: 149 // the starting offset of the first byte in the target data 150 // is equal to the dictionary size. FindBestMatch() will add 151 // starting_offset_ to any best_match->source_offset() value it returns, 152 // in order to produce the correct offset value for a target BlockHash. 153 // 2. For a dictionary BlockHash, the entire data set is hashed at once 154 // when Init() is called with the parameter populate_hash_table = true. 155 // For a target BlockHash, because the previously encoded target data 156 // includes only the data seen up to the current encoding position, 157 // the data blocks are hashed incrementally as the encoding position 158 // advances, using AddOneIndexHash() and AddAllBlocksThroughIndex(). 159 // 160 // The following two factory functions can be used to create BlockHash 161 // objects for each of these two purposes. Each factory function calls 162 // the object constructor and also calls Init(). If an error occurs, 163 // NULL is returned; otherwise a valid BlockHash object is returned. 164 // Since a dictionary BlockHash is not expected to be modified after 165 // initialization, a const object is returned. 166 // The caller is responsible for deleting the returned object 167 // (using the C++ delete operator) once it is no longer needed. 168 static const BlockHash* CreateDictionaryHash(const char* dictionary_data, 169 size_t dictionary_size); 170 static BlockHash* CreateTargetHash(const char* target_data, 171 size_t target_size, 172 size_t dictionary_size); 173 174 // This function will be called to add blocks incrementally to the target hash 175 // as the encoding position advances through the target data. It will be 176 // called for every kBlockSize-byte block in the target data, regardless 177 // of whether the block is aligned evenly on a block boundary. The 178 // BlockHash will only store hash entries for the evenly-aligned blocks. 179 // AddOneIndexHash(int index,uint32_t hash_value)180 void AddOneIndexHash(int index, uint32_t hash_value) { 181 if (index == NextIndexToAdd()) { 182 AddBlock(hash_value); 183 } 184 } 185 186 // Calls AddBlock() for each kBlockSize-byte block in the range 187 // (last_block_added_ * kBlockSize, end_index), exclusive of the endpoints. 188 // If end_index <= the last index added (last_block_added_ * kBlockSize), 189 // this function does nothing. 190 // 191 // A partial block beginning anywhere up to (end_index - 1) is also added, 192 // unless it extends outside the end of the source data. Like AddAllBlocks(), 193 // this function computes the hash value for each of the blocks in question 194 // from scratch, so it is not a good option if the hash values have already 195 // been computed for some other purpose. 196 // 197 // Example: assume kBlockSize = 4, last_block_added_ = 1, and there are 198 // 14 bytes of source data. 199 // If AddAllBlocksThroughIndex(9) is invoked, then it will call AddBlock() 200 // only for block number 2 (at index 8). 201 // If, after that, AddAllBlocksThroughIndex(14) is invoked, it will not call 202 // AddBlock() at all, because block 3 (beginning at index 12) would 203 // fall outside the range of source data. 204 // 205 // VCDiffEngine::Encode (in vcdiffengine.cc) uses this function to 206 // add a whole range of data to a target hash when a COPY instruction 207 // is generated. 208 void AddAllBlocksThroughIndex(int end_index); 209 210 // FindBestMatch takes a position within the unencoded target data 211 // (target_candidate_start) and the hash value of the kBlockSize bytes 212 // beginning at that position (hash_value). It attempts to find a matching 213 // set of bytes within the source (== dictionary) data, expanding 214 // the match both below and above the target block. It cannot expand 215 // the match outside the bounds of the source data, or below 216 // target_start within the target data, or past 217 // the end limit of (target_start + target_length). 218 // 219 // target_candidate_start is the start of the candidate block within the 220 // target data for which a match will be sought, while 221 // target_start (which is <= target_candidate_start) 222 // is the start of the target data that has yet to be encoded. 223 // 224 // If a match is found whose size is greater than the size 225 // of best_match, this function populates *best_match with the 226 // size, source_offset, and target_offset of the match found. 227 // best_match->source_offset() will contain the index of the start of the 228 // matching source data, plus starting_offset_ 229 // (see description of starting_offset_ for details); 230 // best_match->target_offset() will contain the offset of the match 231 // beginning with target_start = offset 0, such that 232 // 0 <= best_match->target_offset() 233 // <= (target_candidate_start - target_start); 234 // and best_match->size() will contain the size of the match. 235 // If no such match is found, this function leaves *best_match unmodified. 236 // 237 // On calling FindBestMatch(), best_match must 238 // point to a valid Match object, and cannot be NULL. 239 // The same Match object can be passed 240 // when calling FindBestMatch() on a different BlockHash object 241 // for the same candidate data block, in order to find 242 // the best match possible across both objects. For example: 243 // 244 // open_vcdiff::BlockHash::Match best_match; 245 // uint32_t hash_value = 246 // RollingHash<BlockHash::kBlockSize>::Hash(target_candidate_start); 247 // bh1.FindBestMatch(hash_value, 248 // target_candidate_start, 249 // target_start, 250 // target_length, 251 // &best_match); 252 // bh2.FindBestMatch(hash_value, 253 // target_candidate_start, 254 // target_start, 255 // target_length, 256 // &best_match); 257 // if (best_size >= 0) { 258 // // a match was found; its size, source offset, and target offset 259 // // can be found in best_match 260 // } 261 // 262 // hash_value is passed as a separate parameter from target_candidate_start, 263 // (rather than calculated within FindBestMatch) in order to take 264 // advantage of the rolling hash, which quickly calculates the hash value 265 // of the block starting at target_candidate_start based on 266 // the known hash value of the block starting at (target_candidate_start - 1). 267 // See vcdiffengine.cc for more details. 268 // 269 // Example: 270 // kBlockSize: 4 271 // target text: "ANDREW LLOYD WEBBER" 272 // 1^ 5^2^ 3^ 273 // dictionary: "INSURANCE : LLOYDS OF LONDON" 274 // 4^ 275 // hashed dictionary blocks: 276 // "INSU", "RANC", "E : ", "LLOY", "DS O", "F LON" 277 // 278 // 1: target_start (beginning of unencoded data) 279 // 2: target_candidate_start (for the block "LLOY") 280 // 3: target_length (points one byte beyond the last byte of data.) 281 // 4: best_match->source_offset() (after calling FindBestMatch) 282 // 5: best_match->target_offset() (after calling FindBestMatch) 283 // 284 // Under these conditions, FindBestMatch will find a matching 285 // hashed dictionary block for "LLOY", and will extend the beginning of 286 // this match backwards by one byte, and the end of the match forwards 287 // by one byte, finding that the best match is " LLOYD" 288 // with best_match->source_offset() = 10 289 // (offset of " LLOYD" in the source string), 290 // best_match->target_offset() = 6 291 // (offset of " LLOYD" in the target string), 292 // and best_match->size() = 6. 293 // 294 void FindBestMatch(uint32_t hash_value, 295 const char* target_candidate_start, 296 const char* target_start, 297 size_t target_size, 298 Match* best_match) const; 299 300 protected: 301 // FindBestMatch() will not process more than this number 302 // of matching hash entries. 303 // 304 // It is necessary to have a limit on the maximum number of matches 305 // that will be checked in order to avoid the worst-case performance 306 // possible if, for example, all the blocks in the dictionary have 307 // the same hash value. See the unit test SearchStringFindsTooManyMatches 308 // for an example of such a case. The encoder uses a loop in 309 // VCDiffEngine::Encode over each target byte, containing a loop in 310 // BlockHash::FindBestMatch over the number of matches (up to a maximum 311 // of the number of source blocks), containing two loops that extend 312 // the match forwards and backwards up to the number of source bytes. 313 // Total complexity in the worst case is 314 // O([target size] * source_size_ * source_size_) 315 // Placing a limit on the possible number of matches checked changes this to 316 // O([target size] * source_size_ * kMaxMatchesToCheck) 317 // 318 // In empirical testing on real HTML text, using a block size of 4, 319 // the number of true matches per call to FindBestMatch() did not exceed 78; 320 // with a block size of 32, the number of matches did not exceed 3. 321 // 322 // The expected number of true matches scales super-linearly 323 // with the inverse of kBlockSize, but here a linear scale is used 324 // for block sizes smaller than 32. 325 static const int kMaxMatchesToCheck = (kBlockSize >= 32) ? 32 : 326 (32 * (32 / kBlockSize)); 327 328 // Do not skip more than this number of non-matching hash collisions 329 // to find the next matching entry in the hash chain. 330 static const int kMaxProbes = 16; 331 332 // Internal routine which calculates a hash table size based on kBlockSize and 333 // the dictionary_size. Will return a power of two if successful, or 0 if an 334 // internal error occurs. Some calculations (such as GetHashTableIndex()) 335 // depend on the table size being a power of two. 336 static size_t CalcTableSize(const size_t dictionary_size); 337 GetNumberOfBlocks()338 size_t GetNumberOfBlocks() const { 339 return source_size_ / kBlockSize; 340 } 341 342 // Use the lowest-order bits of the hash value 343 // as the index into the hash table. GetHashTableIndex(uint32_t hash_value)344 uint32_t GetHashTableIndex(uint32_t hash_value) const { 345 return hash_value & hash_table_mask_; 346 } 347 348 // The index within source_data_ of the next block 349 // for which AddBlock() should be called. NextIndexToAdd()350 int NextIndexToAdd() const { 351 return (last_block_added_ + 1) * kBlockSize; 352 } 353 354 static inline bool TooManyMatches(int* match_counter); 355 source_data()356 const char* source_data() { return source_data_; } source_size()357 size_t source_size() { return source_size_; } 358 359 // Adds an entry to the hash table for one block of source data of length 360 // kBlockSize, starting at source_data_[block_number * kBlockSize], 361 // where block_number is always (last_block_added_ + 1). That is, 362 // AddBlock() must be called once for each block in source_data_ 363 // in increasing order. 364 void AddBlock(uint32_t hash_value); 365 366 // Calls AddBlock() for each complete kBlockSize-byte block between 367 // source_data_ and (source_data_ + source_size_). It is equivalent 368 // to calling AddAllBlocksThroughIndex(source_data + source_size). 369 // This function is called when Init(true) is invoked. 370 void AddAllBlocks(); 371 372 // Returns true if the contents of the kBlockSize-byte block 373 // beginning at block1 are identical to the contents of 374 // the block beginning at block2; false otherwise. 375 static bool BlockContentsMatch(const char* block1, const char* block2); 376 377 // Compares each machine word of the two (possibly unaligned) blocks, rather 378 // than each byte, thus reducing the number of test-and-branch instructions 379 // executed. Returns a boolean (do the blocks match?) rather than 380 // the signed byte difference returned by memcmp. 381 // 382 // BlockContentsMatch will use either this function or memcmp to do its work, 383 // depending on which is faster for a particular architecture. 384 // 385 // For gcc on x86-based architectures, this function has been shown to run 386 // about twice as fast as the library function memcmp(), and between five and 387 // nine times faster than the assembly instructions (repz and cmpsb) that gcc 388 // uses by default for builtin memcmp. On other architectures, or using 389 // other compilers, this function has not shown to be faster than memcmp. 390 static bool BlockCompareWords(const char* block1, const char* block2); 391 392 // Finds the first block number within the hashed data 393 // that represents a match for the given hash value. 394 // Returns -1 if no match was found. 395 // 396 // Init() must have been called and returned true before using 397 // FirstMatchingBlock or NextMatchingBlock. No check is performed 398 // for this condition; the code will crash if this condition is violated. 399 // 400 // The hash table is initially populated with -1 (not found) values, 401 // so if this function is called before the hash table has been populated 402 // using AddAllBlocks() or AddBlock(), it will simply return -1 403 // for any value of hash_value. 404 int FirstMatchingBlock(uint32_t hash_value, const char* block_ptr) const; 405 406 // Given a block number returned by FirstMatchingBlock() 407 // or by a previous call to NextMatchingBlock(), returns 408 // the next block number that matches the same hash value. 409 // Returns -1 if no match was found. 410 int NextMatchingBlock(int block_number, const char* block_ptr) const; 411 412 // Inline version of FirstMatchingBlock. This saves the cost of a function 413 // call when this routine is called from within the module. The external 414 // (non-inlined) version is called only by unit tests. 415 inline int FirstMatchingBlockInline(uint32_t hash_value, 416 const char* block_ptr) const; 417 418 // Walk through the hash entry chain, skipping over any false matches 419 // (for which the lowest bits of the fingerprints match, 420 // but the actual block data does not.) Returns the block number of 421 // the first true match found, or -1 if no true match was found. 422 // If block_number is a matching block, the function will return block_number 423 // without skipping to the next block. 424 int SkipNonMatchingBlocks(int block_number, const char* block_ptr) const; 425 426 // Returns the number of bytes to the left of source_match_start 427 // that match the corresponding bytes to the left of target_match_start. 428 // Will not examine more than max_bytes bytes, which is to say that 429 // the return value will be in the range [0, max_bytes] inclusive. 430 static int MatchingBytesToLeft(const char* source_match_start, 431 const char* target_match_start, 432 int max_bytes); 433 434 // Returns the number of bytes starting at source_match_end 435 // that match the corresponding bytes starting at target_match_end. 436 // Will not examine more than max_bytes bytes, which is to say that 437 // the return value will be in the range [0, max_bytes] inclusive. 438 static int MatchingBytesToRight(const char* source_match_end, 439 const char* target_match_end, 440 int max_bytes); 441 442 // The protected functions BlockContentsMatch, FirstMatchingBlock, 443 // NextMatchingBlock, MatchingBytesToLeft, and MatchingBytesToRight 444 // should be made accessible to unit tests. 445 friend class BlockHashTest; 446 447 private: 448 const char* const source_data_; 449 const size_t source_size_; 450 451 // The size of this array is determined using CalcTableSize(). It has at 452 // least one element for each kBlockSize-byte block in the source data. 453 // GetHashTableIndex() returns an index into this table for a given hash 454 // value. The value of each element of hash_table_ is the lowest block 455 // number in the source data whose hash value would return the same value from 456 // GetHashTableIndex(), or -1 if there is no matching block. This value can 457 // then be used as an index into next_block_table_ to retrieve the entire set 458 // of matching block numbers. 459 std::vector<int> hash_table_; 460 461 // An array containing one element for each source block. Each element is 462 // either -1 (== not found) or the index of the next block whose hash value 463 // would produce a matching result from GetHashTableIndex(). 464 std::vector<int> next_block_table_; 465 466 // This vector has the same size as next_block_table_. For every block number 467 // B that is referenced in hash_table_, last_block_table_[B] will contain 468 // the maximum block number that has the same GetHashTableIndex() value 469 // as block B. This number may be B itself. For a block number B' that 470 // is not referenced in hash_table_, the value of last_block_table_[B'] is -1. 471 // This table is used only while populating the hash table, not while looking 472 // up hash values in the table. Keeping track of the last block number in the 473 // chain allows us to construct the block chains as FIFO rather than LIFO 474 // lists, so that the match with the lowest index is returned first. This 475 // should result in a more compact encoding because the VCDIFF format favors 476 // smaller index values and repeated index values. 477 std::vector<int> last_block_table_; 478 479 // Performing a bitwise AND with hash_table_mask_ will produce a value ranging 480 // from 0 to the number of elements in hash_table_. 481 uint32_t hash_table_mask_; 482 483 // The offset of the first byte of source data (the data at source_data_[0]). 484 // For the purpose of computing offsets, the source data and target data 485 // are considered to be concatenated -- not literally in a single memory 486 // buffer, but conceptually as described in the RFC. 487 // The first byte of the previously encoded target data 488 // has an offset that is equal to dictionary_size, i.e., just after 489 // the last byte of source data. 490 // For a hash of source (dictionary) data, starting_offset_ will be zero; 491 // for a hash of previously encoded target data, starting_offset_ will be 492 // equal to the dictionary size. 493 const int starting_offset_; 494 495 // The last index added by AddBlock(). This determines the block number 496 // for successive calls to AddBlock(), and is also 497 // used to determine the starting block for AddAllBlocksThroughIndex(). 498 int last_block_added_; 499 500 // Making these private avoids implicit copy constructor & assignment operator 501 BlockHash(const BlockHash&); // NOLINT 502 void operator=(const BlockHash&); 503 }; 504 505 } // namespace open_vcdiff 506 507 #endif // OPEN_VCDIFF_BLOCKHASH_H_ 508