1 // Copyright 2007, 2008 Google Inc.
2 // Authors: Jeff Dean, Sanjay Ghemawat, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #ifndef OPEN_VCDIFF_ROLLING_HASH_H_
17 #define OPEN_VCDIFF_ROLLING_HASH_H_
18
19 #include <config.h>
20 #include <stdint.h> // uint32_t
21 #include "logging.h"
22
23 namespace open_vcdiff {
24
25 // Rabin-Karp hasher module -- this is a faster version with different
26 // constants, so it's not quite Rabin-Karp fingerprinting, but its behavior is
27 // close enough for most applications.
28
29 // Definitions common to all hash window sizes.
30 class RollingHashUtil {
31 public:
32 // Multiplier for incremental hashing. The compiler should be smart enough to
33 // convert (val * kMult) into ((val << 8) + val).
34 static const uint32_t kMult = 257;
35
36 // All hashes are returned modulo "kBase". Current implementation requires
37 // kBase <= 2^32/kMult to avoid overflow. Also, kBase must be a power of two
38 // so that we can compute modulus efficiently.
39 static const uint32_t kBase = (1 << 23);
40
41 // Returns operand % kBase, assuming that kBase is a power of two.
ModBase(uint32_t operand)42 static inline uint32_t ModBase(uint32_t operand) {
43 return operand & (kBase - 1);
44 }
45
46 // Given an unsigned integer "operand", returns an unsigned integer "result"
47 // such that
48 // result < kBase
49 // and
50 // ModBase(operand + result) == 0
FindModBaseInverse(uint32_t operand)51 static inline uint32_t FindModBaseInverse(uint32_t operand) {
52 // The subtraction (0 - operand) produces an unsigned underflow for any
53 // operand except 0. The underflow results in a (very large) unsigned
54 // number. Binary subtraction is used instead of unary negation because
55 // some compilers (e.g. Visual Studio 7+) produce a warning if an unsigned
56 // value is negated.
57 //
58 // The C++ mod operation (operand % kBase) may produce different results for
59 // different compilers if operand is negative. That is not a problem in
60 // this case, since all numbers used are unsigned, and ModBase does its work
61 // using bitwise arithmetic rather than the % operator.
62 return ModBase(uint32_t(0) - operand);
63 }
64
65 // Here's the heart of the hash algorithm. Start with a partial_hash value of
66 // 0, and run this HashStep once against each byte in the data window to be
67 // hashed. The result will be the hash value for the entire data window. The
68 // Hash() function, below, does exactly this, albeit with some refinements.
HashStep(uint32_t partial_hash,unsigned char next_byte)69 static inline uint32_t HashStep(uint32_t partial_hash,
70 unsigned char next_byte) {
71 return ModBase((partial_hash * kMult) + next_byte);
72 }
73
74 // Use this function to start computing a new hash value based on the first
75 // two bytes in the window. It is equivalent to calling
76 // HashStep(HashStep(0, ptr[0]), ptr[1])
77 // but takes advantage of the fact that the maximum value of
78 // (ptr[0] * kMult) + ptr[1] is not large enough to exceed kBase, thus
79 // avoiding an unnecessary ModBase operation.
HashFirstTwoBytes(const char * ptr)80 static inline uint32_t HashFirstTwoBytes(const char* ptr) {
81 return (static_cast<unsigned char>(ptr[0]) * kMult)
82 + static_cast<unsigned char>(ptr[1]);
83 }
84 private:
85 // Making these private avoids copy constructor and assignment operator.
86 // No objects of this type should be constructed.
87 RollingHashUtil();
88 RollingHashUtil(const RollingHashUtil&); // NOLINT
89 void operator=(const RollingHashUtil&);
90 };
91
92 // window_size must be >= 2.
93 template<int window_size>
94 class RollingHash {
95 public:
96 // Perform global initialization that is required in order to instantiate a
97 // RollingHash. This function *must* be called (preferably on startup) by any
98 // program that uses a RollingHash. It is harmless to call this function more
99 // than once. It is not thread-safe, but calling it from two different
100 // threads at the same time can only cause a memory leak, not incorrect
101 // behavior. Make sure to call it before spawning any threads that could use
102 // RollingHash. The function returns true if initialization succeeds, or
103 // false if initialization fails, in which case the caller should not proceed
104 // to construct any objects of type RollingHash.
105 static bool Init();
106
107 // Initialize hasher to maintain a window of the specified size. You need an
108 // instance of this type to use UpdateHash(), but Hash() does not depend on
109 // remove_table_, so it is static.
RollingHash()110 RollingHash() {
111 if (!remove_table_) {
112 LOG(DFATAL) << "RollingHash object instantiated"
113 " before calling RollingHash::Init()" << LOG_ENDL;
114 }
115 }
116
117 // Compute a hash of the window "ptr[0, window_size - 1]".
Hash(const char * ptr)118 static uint32_t Hash(const char* ptr) {
119 uint32_t h = RollingHashUtil::HashFirstTwoBytes(ptr);
120 for (int i = 2; i < window_size; ++i) {
121 h = RollingHashUtil::HashStep(h, ptr[i]);
122 }
123 return h;
124 }
125
126 // Update a hash by removing the oldest byte and adding a new byte.
127 //
128 // UpdateHash takes the hash value of buffer[0] ... buffer[window_size -1]
129 // along with the value of buffer[0] (the "old_first_byte" argument)
130 // and the value of buffer[window_size] (the "new_last_byte" argument).
131 // It quickly computes the hash value of buffer[1] ... buffer[window_size]
132 // without having to run Hash() on the entire window.
133 //
134 // The larger the window, the more advantage comes from using UpdateHash()
135 // (which runs in time independent of window_size) instead of Hash().
136 // Each time window_size doubles, the time to execute Hash() also doubles,
137 // while the time to execute UpdateHash() remains constant. Empirical tests
138 // have borne out this statement.
UpdateHash(uint32_t old_hash,const char old_first_byte,const char new_last_byte)139 uint32_t UpdateHash(uint32_t old_hash,
140 const char old_first_byte,
141 const char new_last_byte) const {
142 uint32_t partial_hash = RemoveFirstByteFromHash(old_hash, old_first_byte);
143 return RollingHashUtil::HashStep(partial_hash, new_last_byte);
144 }
145
146 protected:
147 // Given a full hash value for buffer[0] ... buffer[window_size -1], plus the
148 // value of the first byte buffer[0], this function returns a *partial* hash
149 // value for buffer[1] ... buffer[window_size -1]. See the comments in
150 // Init(), below, for a description of how the contents of remove_table_ are
151 // computed.
RemoveFirstByteFromHash(uint32_t full_hash,unsigned char first_byte)152 static uint32_t RemoveFirstByteFromHash(uint32_t full_hash,
153 unsigned char first_byte) {
154 return RollingHashUtil::ModBase(full_hash + remove_table_[first_byte]);
155 }
156
157 private:
158 // We keep a table that maps from any byte "b" to
159 // (- b * pow(kMult, window_size - 1)) % kBase
160 static const uint32_t* remove_table_;
161 };
162
163 // For each window_size, fill a 256-entry table such that
164 // the hash value of buffer[0] ... buffer[window_size - 1]
165 // + remove_table_[buffer[0]]
166 // == the hash value of buffer[1] ... buffer[window_size - 1]
167 // See the comments in Init(), below, for a description of how the contents of
168 // remove_table_ are computed.
169 template<int window_size>
170 const uint32_t* RollingHash<window_size>::remove_table_ = NULL;
171
172 // Init() checks to make sure that the static object remove_table_ has been
173 // initialized; if not, it does the considerable work of populating it. Once
174 // it's ready, the table can be used for any number of RollingHash objects of
175 // the same window_size.
176 //
177 template<int window_size>
Init()178 bool RollingHash<window_size>::Init() {
179 if (window_size < 2) {
180 LOG(ERROR) << "RollingHash window size " << window_size
181 << " is too small" << LOG_ENDL;
182 return false;
183 }
184 if (remove_table_ == NULL) {
185 // The new object is placed into a local pointer instead of directly into
186 // remove_table_, for two reasons:
187 // 1. remove_table_ is a pointer to const. The table is populated using
188 // the non-const local pointer and then assigned to the global const
189 // pointer once it's ready.
190 // 2. No other thread will ever see remove_table_ pointing to a
191 // partially-initialized table. If two threads happen to call Init()
192 // at the same time, two tables with the same contents may be created
193 // (causing a memory leak), but the results will be consistent
194 // no matter which of the two tables is used.
195 uint32_t* new_remove_table = new uint32_t[256];
196 // Compute multiplier. Concisely, it is:
197 // pow(kMult, (window_size - 1)) % kBase,
198 // but we compute the power in integer form.
199 uint32_t multiplier = 1;
200 for (int i = 0; i < window_size - 1; ++i) {
201 multiplier =
202 RollingHashUtil::ModBase(multiplier * RollingHashUtil::kMult);
203 }
204 // For each character removed_byte, compute
205 // remove_table_[removed_byte] ==
206 // (- (removed_byte * pow(kMult, (window_size - 1)))) % kBase
207 // where the power operator "pow" is taken in integer form.
208 //
209 // If you take a hash value fp representing the hash of
210 // buffer[0] ... buffer[window_size - 1]
211 // and add the value of remove_table_[buffer[0]] to it, the result will be
212 // a partial hash value for
213 // buffer[1] ... buffer[window_size - 1]
214 // that is to say, it no longer includes buffer[0].
215 //
216 // The following byte at buffer[window_size] can then be merged with this
217 // partial hash value to arrive quickly at the hash value for a window that
218 // has advanced by one byte, to
219 // buffer[1] ... buffer[window_size]
220 // In fact, that is precisely what happens in UpdateHash, above.
221 uint32_t byte_times_multiplier = 0;
222 for (int removed_byte = 0; removed_byte < 256; ++removed_byte) {
223 new_remove_table[removed_byte] =
224 RollingHashUtil::FindModBaseInverse(byte_times_multiplier);
225 // Iteratively adding the multiplier in this loop is equivalent to
226 // computing (removed_byte * multiplier), and is faster
227 byte_times_multiplier =
228 RollingHashUtil::ModBase(byte_times_multiplier + multiplier);
229 }
230 remove_table_ = new_remove_table;
231 }
232 return true;
233 }
234
235 } // namespace open_vcdiff
236
237 #endif // OPEN_VCDIFF_ROLLING_HASH_H_
238