1 // Copyright 2008 Google Inc.
2 // Author: Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include <config.h>
17 #include "encodetable.h"
18 #include <string>
19 #include "addrcache.h"
20 #include "codetable.h"
21 #include "instruction_map.h"
22 #include "logging.h"
23 #include "google/output_string.h"
24 #include "varint_bigendian.h"
25 #include "vcdiff_defs.h"
26
27 namespace open_vcdiff {
28
29 // VCDiffCodeTableWriter members and methods
30
31 // If interleaved is true, the encoder writes each delta file window
32 // by interleaving instructions and sizes with their corresponding
33 // addresses and data, rather than placing these elements into three
34 // separate sections. This facilitates providing partially
35 // decoded results when only a portion of a delta file window
36 // is received (e.g. when HTTP over TCP is used as the
37 // transmission protocol.) The interleaved format is
38 // not consistent with the VCDIFF draft standard.
39 //
VCDiffCodeTableWriter(bool interleaved)40 VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
41 : max_mode_(VCDiffAddressCache::DefaultLastMode()),
42 dictionary_size_(0),
43 target_length_(0),
44 code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
45 instruction_map_(NULL),
46 last_opcode_index_(-1),
47 add_checksum_(false),
48 checksum_(0),
49 match_counts_(kMaxMatchSize, 0) {
50 InitSectionPointers(interleaved);
51 }
52
VCDiffCodeTableWriter(bool interleaved,int near_cache_size,int same_cache_size,const VCDiffCodeTableData & code_table_data,unsigned char max_mode)53 VCDiffCodeTableWriter::VCDiffCodeTableWriter(
54 bool interleaved,
55 int near_cache_size,
56 int same_cache_size,
57 const VCDiffCodeTableData& code_table_data,
58 unsigned char max_mode)
59 : max_mode_(max_mode),
60 address_cache_(near_cache_size, same_cache_size),
61 dictionary_size_(0),
62 target_length_(0),
63 code_table_data_(&code_table_data),
64 instruction_map_(NULL),
65 last_opcode_index_(-1),
66 add_checksum_(false),
67 checksum_(0) {
68 InitSectionPointers(interleaved);
69 }
70
~VCDiffCodeTableWriter()71 VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
72 if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
73 delete instruction_map_;
74 }
75 }
76
InitSectionPointers(bool interleaved)77 void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
78 if (interleaved) {
79 data_for_add_and_run_ = &instructions_and_sizes_;
80 addresses_for_copy_ = &instructions_and_sizes_;
81 } else {
82 data_for_add_and_run_ = &separate_data_for_add_and_run_;
83 addresses_for_copy_ = &separate_addresses_for_copy_;
84 }
85 }
86
Init(size_t dictionary_size)87 bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
88 dictionary_size_ = dictionary_size;
89 if (!instruction_map_) {
90 if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
91 instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
92 } else {
93 instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
94 }
95 if (!instruction_map_) {
96 return false;
97 }
98 }
99 if (!address_cache_.Init()) {
100 return false;
101 }
102 target_length_ = 0;
103 last_opcode_index_ = -1;
104 return true;
105 }
106
107 // The VCDiff format allows each opcode to represent either
108 // one or two delta instructions. This function will first
109 // examine the opcode generated by the last call to EncodeInstruction.
110 // If that opcode was a single-instruction opcode, this function checks
111 // whether there is a compound (double-instruction) opcode that can
112 // combine that single instruction with the instruction that is now
113 // being added, and so save a byte of space. In that case, the
114 // single-instruction opcode at position last_opcode_index_ will be
115 // overwritten with the new double-instruction opcode.
116 //
117 // In the majority of cases, no compound opcode will be possible,
118 // and a new single-instruction opcode will be appended to
119 // instructions_and_sizes_, followed by a representation of its size
120 // if the opcode does not implicitly give the instruction size.
121 //
122 // As an example, say instructions_and_sizes_ contains 10 bytes, the last
123 // of which contains the opcode 0x02 (ADD size 1). Because that was the
124 // most recently added opcode, last_opcode_index_ has the value 10.
125 // EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
126 // The function will replace the old opcode 0x02 with the double-instruction
127 // opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
128 //
129 // All of the double-instruction opcodes in the standard code table
130 // have implicit sizes, meaning that the size of the instruction will not
131 // need to be written to the instructions_and_sizes_ string separately
132 // from the opcode. If a custom code table were used that did not have
133 // this property, then instructions_and_sizes_ might contain a
134 // double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
135 // followed by the size of the COPY, then by the size of the ADD.
136 // If using the SDCH interleaved format, the address of the COPY instruction
137 // would follow its size, so the ordering would be
138 // [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
139 //
EncodeInstruction(VCDiffInstructionType inst,size_t size,unsigned char mode)140 void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
141 size_t size,
142 unsigned char mode) {
143 if (!instruction_map_) {
144 LOG(DFATAL) << "EncodeInstruction() called without calling Init()"
145 << LOG_ENDL;
146 return;
147 }
148 if (last_opcode_index_ >= 0) {
149 const unsigned char last_opcode =
150 instructions_and_sizes_[last_opcode_index_];
151 // The encoding engine should not generate two ADD instructions in a row.
152 // This won't cause a failure, but it's inefficient encoding and probably
153 // represents a bug in the higher-level logic of the encoder.
154 if ((inst == VCD_ADD) &&
155 (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
156 LOG(WARNING) << "EncodeInstruction() called for two ADD instructions"
157 " in a row" << LOG_ENDL;
158 }
159 OpcodeOrNone compound_opcode = kNoOpcode;
160 if (size <= UCHAR_MAX) {
161 compound_opcode =
162 instruction_map_->LookupSecondOpcode(last_opcode,
163 inst,
164 static_cast<unsigned char>(size),
165 mode);
166 if (compound_opcode != kNoOpcode) {
167 instructions_and_sizes_[last_opcode_index_] =
168 static_cast<unsigned char>(compound_opcode);
169 last_opcode_index_ = -1;
170 return;
171 }
172 }
173 // Try finding a compound opcode with size 0.
174 compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
175 inst,
176 0,
177 mode);
178 if (compound_opcode != kNoOpcode) {
179 instructions_and_sizes_[last_opcode_index_] =
180 static_cast<unsigned char>(compound_opcode);
181 last_opcode_index_ = -1;
182 AppendSizeToString(size, &instructions_and_sizes_);
183 return;
184 }
185 }
186 OpcodeOrNone opcode = kNoOpcode;
187 if (size <= UCHAR_MAX) {
188 opcode =
189 instruction_map_->LookupFirstOpcode(inst,
190 static_cast<unsigned char>(size),
191 mode);
192 if (opcode != kNoOpcode) {
193 instructions_and_sizes_.push_back(static_cast<char>(opcode));
194 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
195 return;
196 }
197 }
198 // There should always be an opcode with size 0.
199 opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
200 if (opcode == kNoOpcode) {
201 LOG(DFATAL) << "No matching opcode found for inst " << inst
202 << ", mode " << mode << ", size 0" << LOG_ENDL;
203 return;
204 }
205 instructions_and_sizes_.push_back(static_cast<char>(opcode));
206 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
207 AppendSizeToString(size, &instructions_and_sizes_);
208 }
209
Add(const char * data,size_t size)210 void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
211 EncodeInstruction(VCD_ADD, size);
212 data_for_add_and_run_->append(data, size);
213 target_length_ += size;
214 }
215
Copy(int32_t offset,size_t size)216 void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
217 if (!instruction_map_) {
218 LOG(DFATAL) << "VCDiffCodeTableWriter::Copy() called without calling Init()"
219 << LOG_ENDL;
220 return;
221 }
222 // If a single interleaved stream of encoded values is used
223 // instead of separate sections for instructions, addresses, and data,
224 // then the string instructions_and_sizes_ may be the same as
225 // addresses_for_copy_. The address should therefore be encoded
226 // *after* the instruction and its size.
227 int32_t encoded_addr = 0;
228 const unsigned char mode = address_cache_.EncodeAddress(
229 offset,
230 static_cast<VCDAddress>(dictionary_size_ + target_length_),
231 &encoded_addr);
232 EncodeInstruction(VCD_COPY, size, mode);
233 if (address_cache_.WriteAddressAsVarintForMode(mode)) {
234 VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
235 } else {
236 addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
237 }
238 target_length_ += size;
239 if (size >= match_counts_.size()) {
240 match_counts_.resize(size * 2, 0); // Be generous to avoid resizing again
241 }
242 ++match_counts_[size];
243 }
244
Run(size_t size,unsigned char byte)245 void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
246 EncodeInstruction(VCD_RUN, size);
247 data_for_add_and_run_->push_back(byte);
248 target_length_ += size;
249 }
250
CalculateLengthOfSizeAsVarint(size_t size)251 size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
252 return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
253 }
254
AppendSizeToString(size_t size,string * out)255 void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
256 VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
257 }
258
AppendSizeToOutputString(size_t size,OutputStringInterface * out)259 void VCDiffCodeTableWriter::AppendSizeToOutputString(
260 size_t size,
261 OutputStringInterface* out) {
262 VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
263 }
264
265 // This calculation must match the items added between "Start of Delta Encoding"
266 // and "End of Delta Encoding" in Output(), below.
CalculateLengthOfTheDeltaEncoding() const267 size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
268 size_t length_of_the_delta_encoding =
269 CalculateLengthOfSizeAsVarint(target_length_) +
270 1 + // Delta_Indicator
271 CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
272 CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
273 CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
274 separate_data_for_add_and_run_.size() +
275 instructions_and_sizes_.size() +
276 separate_addresses_for_copy_.size();
277 if (add_checksum_) {
278 length_of_the_delta_encoding +=
279 VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
280 }
281 return length_of_the_delta_encoding;
282 }
283
Output(OutputStringInterface * out)284 void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
285 if (instructions_and_sizes_.empty()) {
286 LOG(WARNING) << "Empty input; no delta window produced" << LOG_ENDL;
287 } else {
288 const size_t length_of_the_delta_encoding =
289 CalculateLengthOfTheDeltaEncoding();
290 const size_t delta_window_size =
291 length_of_the_delta_encoding +
292 1 + // Win_Indicator
293 CalculateLengthOfSizeAsVarint(dictionary_size_) +
294 CalculateLengthOfSizeAsVarint(0) +
295 CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
296 // append() will be called many times on the output string; make sure
297 // the output string is resized only once at most.
298 out->ReserveAdditionalBytes(delta_window_size);
299
300 // Add first element: Win_Indicator
301 if (add_checksum_) {
302 out->push_back(VCD_SOURCE | VCD_CHECKSUM);
303 } else {
304 out->push_back(VCD_SOURCE);
305 }
306 // Source segment size: dictionary size
307 AppendSizeToOutputString(dictionary_size_, out);
308 // Source segment position: 0 (start of dictionary)
309 AppendSizeToOutputString(0, out);
310
311 // [Here is where a secondary compressor would be used
312 // if the encoder and decoder supported that feature.]
313
314 AppendSizeToOutputString(length_of_the_delta_encoding, out);
315 // Start of Delta Encoding
316 const size_t size_before_delta_encoding = out->size();
317 AppendSizeToOutputString(target_length_, out);
318 out->push_back(0x00); // Delta_Indicator: no compression
319 AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
320 AppendSizeToOutputString(instructions_and_sizes_.size(), out);
321 AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
322 if (add_checksum_) {
323 // The checksum is a 32-bit *unsigned* integer. VarintBE requires a
324 // signed type, so use a 64-bit signed integer to store the checksum.
325 VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
326 out);
327 }
328 out->append(separate_data_for_add_and_run_.data(),
329 separate_data_for_add_and_run_.size());
330 out->append(instructions_and_sizes_.data(),
331 instructions_and_sizes_.size());
332 out->append(separate_addresses_for_copy_.data(),
333 separate_addresses_for_copy_.size());
334 // End of Delta Encoding
335 const size_t size_after_delta_encoding = out->size();
336 if (length_of_the_delta_encoding !=
337 (size_after_delta_encoding - size_before_delta_encoding)) {
338 LOG(DFATAL) << "Internal error: calculated length of the delta encoding ("
339 << length_of_the_delta_encoding
340 << ") does not match actual length ("
341 << (size_after_delta_encoding - size_before_delta_encoding)
342 << LOG_ENDL;
343 }
344 separate_data_for_add_and_run_.clear();
345 instructions_and_sizes_.clear();
346 separate_addresses_for_copy_.clear();
347 if (target_length_ == 0) {
348 LOG(WARNING) << "Empty target window" << LOG_ENDL;
349 }
350 }
351
352 // Reset state for next window; assume we are using same code table
353 // and dictionary. The caller will have to invoke Init() if a different
354 // dictionary is used.
355 //
356 // Notably, Init() calls address_cache_.Init(). This resets the address
357 // cache between delta windows, as required by RFC section 5.1.
358 if (!Init(dictionary_size_)) {
359 LOG(DFATAL) << "Internal error: calling Init() to reset "
360 "VCDiffCodeTableWriter state failed" << LOG_ENDL;
361 }
362 }
363
364 }; // namespace open_vcdiff
365