1 // Copyright 2007 Google Inc.
2 // Author: Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 // Implementation of the Address Cache and Address Encoding
17 // algorithms described in sections 5.1 - 5.4 of RFC 3284 -
18 // The VCDIFF Generic Differencing and Compression Data Format.
19 // The RFC text can be found at http://www.faqs.org/rfcs/rfc3284.html
20 //
21 // Assumptions:
22 // * The VCDAddress type is large enough to hold any offset within
23 // the source and target windows. The limit (for int32_t) is 2^31-1 bytes.
24 // The source (dictionary) should not approach this size limit;
25 // to compress a target file that is larger than
26 // INT_MAX - (dictionary size) bytes, the encoder must
27 // break it up into multiple target windows.
28
29 #include <config.h>
30 #include "addrcache.h"
31 #include "logging.h"
32 #include "varint_bigendian.h"
33 #include "vcdiff_defs.h" // RESULT_ERROR
34
35 namespace open_vcdiff {
36
37 // The constructor does not initialize near_addresses_ and same_addresses_.
38 // Therefore, Init() must be called before any other method can be used.
39 //
40 // Arguments:
41 // near_cache_size: Size of the NEAR cache (number of 4-byte integers)
42 // same_cache_size: Size of the SAME cache (number of blocks of
43 // 256 4-byte integers per block)
44 // Because the mode is expressed as a byte value,
45 // near_cache_size + same_cache_size should not exceed 254.
46 //
VCDiffAddressCache(int near_cache_size,int same_cache_size)47 VCDiffAddressCache::VCDiffAddressCache(int near_cache_size,
48 int same_cache_size)
49 : near_cache_size_(near_cache_size),
50 same_cache_size_(same_cache_size),
51 next_slot_(0) { }
52
VCDiffAddressCache()53 VCDiffAddressCache::VCDiffAddressCache()
54 : near_cache_size_(kDefaultNearCacheSize),
55 same_cache_size_(kDefaultSameCacheSize),
56 next_slot_(0) { }
57
58 // Sets up data structures needed to call other methods. Operations that may
59 // fail at runtime (for example, validating the provided near_cache_size_ and
60 // same_cache_size_ parameters against their maximum allowed values) are
61 // confined to this routine in order to guarantee that the class constructor
62 // will never fail. Other methods (except the destructor) cannot be invoked
63 // until this method has been called successfully. After the object has been
64 // initialized and used, Init() can be called again to reset it to its initial
65 // state.
66 //
67 // Return value: "true" if initialization succeeded, "false" if it failed.
68 // No other method except the destructor may be invoked if this function
69 // returns false. The caller is responsible for checking the return value
70 // and providing an exit path in case of error.
71 //
Init()72 bool VCDiffAddressCache::Init() {
73 // The mode is expressed as a byte value, so there is only room for 256 modes,
74 // including the two non-cached modes (SELF and HERE). Do not allow a larger
75 // number of modes to be defined. We do a separate sanity check for
76 // near_cache_size_ and same_cache_size_ because adding them together can
77 // cause an integer overflow if each is set to, say, INT_MAX.
78 if ((near_cache_size_ > (VCD_MAX_MODES - 2)) || (near_cache_size_ < 0)) {
79 LOG(ERROR) << "Near cache size " << near_cache_size_ << " is invalid"
80 << LOG_ENDL;
81 return false;
82 }
83 if ((same_cache_size_ > (VCD_MAX_MODES - 2)) || (same_cache_size_ < 0)) {
84 LOG(ERROR) << "Same cache size " << same_cache_size_ << " is invalid"
85 << LOG_ENDL;
86 return false;
87 }
88 if ((near_cache_size_ + same_cache_size_) > VCD_MAX_MODES - 2) {
89 LOG(ERROR) << "Using near cache size " << near_cache_size_
90 << " and same cache size " << same_cache_size_
91 << " would exceed maximum number of COPY modes ("
92 << VCD_MAX_MODES << ")" << LOG_ENDL;
93 return false;
94 }
95 if (near_cache_size_ > 0) {
96 near_addresses_.assign(near_cache_size_, 0);
97 }
98 if (same_cache_size_ > 0) {
99 same_addresses_.assign(same_cache_size_ * 256, 0);
100 }
101 next_slot_ = 0; // in case Init() is called a second time to reinit
102 return true;
103 }
104
105 // This method will be called whenever an address is calculated for an
106 // encoded or decoded COPY instruction, and will update the contents
107 // of the SAME and NEAR caches. It is vital that the use of
108 // UpdateCache (called cache_update in the RFC examples) exactly match
109 // the RFC standard, and that the same caching logic be used in the
110 // decoder as in the encoder, in order for the decoded addresses to
111 // match.
112 //
113 // Argument:
114 // address: This must be a valid address between 0 and
115 // (source window size + target window size). It is assumed that
116 // these bounds have been checked before calling UpdateCache.
117 //
UpdateCache(VCDAddress address)118 void VCDiffAddressCache::UpdateCache(VCDAddress address) {
119 if (near_cache_size_ > 0) {
120 near_addresses_[next_slot_] = address;
121 next_slot_ = (next_slot_ + 1) % near_cache_size_;
122 }
123 if (same_cache_size_ > 0) {
124 same_addresses_[address % (same_cache_size_ * 256)] = address;
125 }
126 }
127
128 // Determines the address mode that yields the most compact encoding
129 // of the given address value, writes the encoded address into the
130 // address stream, and returns the mode used. The most compact encoding
131 // is found by looking for the numerically lowest encoded address.
132 // The Init() function must already have been called.
133 //
134 // Arguments:
135 // address: The address to be encoded. Must be a non-negative integer
136 // between 0 and (here_address - 1).
137 // here_address: The current location in the target data (i.e., the
138 // position just after the last encoded value.) Must be non-negative.
139 // encoded_addr: Points to an VCDAddress that will be replaced
140 // with the encoded representation of address.
141 // If WriteAddressAsVarintForMode returns true when passed
142 // the return value, then encoded_addr should be written
143 // into the delta file as a variable-length integer (Varint);
144 // otherwise, it should be written as a byte (unsigned char).
145 //
146 // Return value: A mode value between 0 and 255. The mode will tell
147 // how to interpret the next value in the address stream.
148 // The values 0 and 1 correspond to SELF and HERE addressing.
149 //
150 // The function is guaranteed to succeed unless the conditions on the arguments
151 // have not been met, in which case a LOG(DFATAL) message will be produced,
152 // 0 will be returned, and *encoded_addr will be replaced with 0.
153 //
EncodeAddress(VCDAddress address,VCDAddress here_address,VCDAddress * encoded_addr)154 unsigned char VCDiffAddressCache::EncodeAddress(VCDAddress address,
155 VCDAddress here_address,
156 VCDAddress* encoded_addr) {
157 if (address < 0) {
158 LOG(DFATAL) << "EncodeAddress was passed a negative address: "
159 << address << LOG_ENDL;
160 *encoded_addr = 0;
161 return 0;
162 }
163 if (address >= here_address) {
164 LOG(DFATAL) << "EncodeAddress was called with address (" << address
165 << ") < here_address (" << here_address << ")" << LOG_ENDL;
166 *encoded_addr = 0;
167 return 0;
168 }
169 // Try using the SAME cache. This method, if available, always
170 // results in the smallest encoding and takes priority over other modes.
171 if (same_cache_size() > 0) {
172 const VCDAddress same_cache_pos =
173 address % (same_cache_size() * 256);
174 if (SameAddress(same_cache_pos) == address) {
175 // This is the only mode for which an single byte will be written
176 // to the address stream instead of a variable-length integer.
177 UpdateCache(address);
178 *encoded_addr = same_cache_pos % 256;
179 return FirstSameMode() + (same_cache_pos / 256); // SAME mode
180 }
181 }
182
183 // Try SELF mode
184 unsigned char best_mode = VCD_SELF_MODE;
185 VCDAddress best_encoded_address = address;
186
187 // Try HERE mode
188 {
189 const VCDAddress here_encoded_address = here_address - address;
190 if (here_encoded_address < best_encoded_address) {
191 best_mode = VCD_HERE_MODE;
192 best_encoded_address = here_encoded_address;
193 }
194 }
195
196 // Try using the NEAR cache
197 for (int i = 0; i < near_cache_size(); ++i) {
198 const VCDAddress near_encoded_address = address - NearAddress(i);
199 if ((near_encoded_address >= 0) &&
200 (near_encoded_address < best_encoded_address)) {
201 best_mode = FirstNearMode() + i;
202 best_encoded_address = near_encoded_address;
203 }
204 }
205
206 UpdateCache(address);
207 *encoded_addr = best_encoded_address;
208 return best_mode;
209 }
210
211 // Increments *byte_pointer and returns the byte it pointed to before the
212 // increment. The caller must check bounds to ensure that *byte_pointer
213 // points to a valid address in memory.
ParseByte(const char ** byte_pointer)214 static unsigned char ParseByte(const char** byte_pointer) {
215 unsigned char byte_value = static_cast<unsigned char>(**byte_pointer);
216 ++(*byte_pointer);
217 return byte_value;
218 }
219
220 // Checks the given decoded address for validity. Returns true if the
221 // address is valid; otherwise, prints an error message to the log and
222 // returns false.
IsDecodedAddressValid(VCDAddress decoded_address,VCDAddress here_address)223 static bool IsDecodedAddressValid(VCDAddress decoded_address,
224 VCDAddress here_address) {
225 if (decoded_address < 0) {
226 LOG(ERROR) << "Decoded address " << decoded_address << " is invalid"
227 << LOG_ENDL;
228 return false;
229 } else if (decoded_address >= here_address) {
230 LOG(ERROR) << "Decoded address (" << decoded_address
231 << ") is beyond location in target file (" << here_address
232 << ")" << LOG_ENDL;
233 return false;
234 }
235 return true;
236 }
237
238 // Interprets the next value in the address_stream using the provided mode,
239 // which may need to access the SAME or NEAR address cache. Returns the
240 // decoded address.
241 // The Init() function must already have been called.
242 //
243 // Arguments:
244 // here_address: The current location in the source + target data (i.e., the
245 // location into which the COPY instruction will copy.) By definition,
246 // all addresses between 0 and (here_address - 1) are valid, and
247 // any other address is invalid.
248 // mode: A byte value between 0 and (near_cache_size_ + same_cache_size_ + 1)
249 // which tells how to interpret the next value in the address stream.
250 // The values 0 and 1 correspond to SELF and HERE addressing.
251 // The validity of "mode" should already have been checked before
252 // calling this function.
253 // address_stream: Points to a pointer holding the position
254 // in the "Addresses section for COPYs" part of the input data.
255 // That section must already have been uncompressed
256 // using a secondary decompressor (if necessary.)
257 // This is an IN/OUT argument; the value of *address_stream will be
258 // incremented by the size of an integer, or (if the SAME cache
259 // was used) by the size of a byte (1).
260 // address_stream_end: Points to the position just after the end of
261 // the address stream buffer. All addresses between *address_stream
262 // and address_stream_end should contain valid address data.
263 //
264 // Return value: If the input conditions were met, and the address section
265 // of the input data contains properly encoded addresses that match
266 // the instructions section, then an integer between 0 and here_address - 1
267 // will be returned, representing the address from which data should
268 // be copied from the source or target window into the output stream.
269 // If an invalid address value is found in address_stream, then
270 // RESULT_ERROR will be returned. If the limit address_stream_end
271 // is reached before the address can be decoded, then
272 // RESULT_END_OF_DATA will be returned. If more streamed data
273 // is expected, this means that the consumer should block and wait
274 // for more data before continuing to decode. If no more data is expected,
275 // this return value signals an error condition.
276 //
DecodeAddress(VCDAddress here_address,unsigned char mode,const char ** address_stream,const char * address_stream_end)277 VCDAddress VCDiffAddressCache::DecodeAddress(VCDAddress here_address,
278 unsigned char mode,
279 const char** address_stream,
280 const char* address_stream_end) {
281 if (here_address < 0) {
282 LOG(DFATAL) << "DecodeAddress was passed a negative value"
283 " for here_address: " << here_address << LOG_ENDL;
284 return RESULT_ERROR;
285 }
286 const char* new_address_pos = *address_stream;
287 if (new_address_pos >= address_stream_end) {
288 return RESULT_END_OF_DATA;
289 }
290 VCDAddress decoded_address;
291 if (IsSameMode(mode)) {
292 // SAME mode expects a byte value as the encoded address
293 unsigned char encoded_address = ParseByte(&new_address_pos);
294 decoded_address = DecodeSameAddress(mode, encoded_address);
295 } else {
296 // All modes except SAME mode expect a VarintBE as the encoded address
297 int32_t encoded_address = VarintBE<int32_t>::Parse(address_stream_end,
298 &new_address_pos);
299 switch (encoded_address) {
300 case RESULT_ERROR:
301 LOG(ERROR) << "Found invalid variable-length integer "
302 "as encoded address value" << LOG_ENDL;
303 return RESULT_ERROR;
304 case RESULT_END_OF_DATA:
305 return RESULT_END_OF_DATA;
306 default:
307 break;
308 }
309 if (IsSelfMode(mode)) {
310 decoded_address = DecodeSelfAddress(encoded_address);
311 } else if (IsHereMode(mode)) {
312 decoded_address = DecodeHereAddress(encoded_address, here_address);
313 } else if (IsNearMode(mode)) {
314 decoded_address = DecodeNearAddress(mode, encoded_address);
315 } else {
316 LOG(DFATAL) << "Invalid mode value (" << static_cast<int>(mode)
317 << ") passed to DecodeAddress; maximum mode value = "
318 << static_cast<int>(LastMode()) << LOG_ENDL;
319 return RESULT_ERROR;
320 }
321 }
322 // Check for an out-of-bounds address (corrupt/malicious data)
323 if (!IsDecodedAddressValid(decoded_address, here_address)) {
324 return RESULT_ERROR;
325 }
326 *address_stream = new_address_pos;
327 UpdateCache(decoded_address);
328 return decoded_address;
329 }
330
331 } // namespace open_vcdiff
332