1 // Copyright 2008 Google Inc.
2 // Author: Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include <config.h>
17 #include <limits.h> // UCHAR_MAX
18 #include <string>
19 #include "addrcache.h"
20 #include "codetable.h"
21 #include "encodetable.h"
22 #include "instruction_map.h"
23 #include "logging.h"
24 #include "google/output_string.h"
25 #include "varint_bigendian.h"
26 #include "vcdiff_defs.h"
27
28 namespace open_vcdiff {
29
30 static const DeltaFileHeader kHeaderStandardFormat = {
31 0xD6, // Header1: "V" | 0x80
32 0xC3, // Header2: "C" | 0x80
33 0xC4, // Header3: "D" | 0x80
34 0x00, // Header4: Draft standard format
35 0x00 }; // Hdr_Indicator:
36 // No compression, no custom code table
37
38 static const DeltaFileHeader kHeaderExtendedFormat = {
39 0xD6, // Header1: "V" | 0x80
40 0xC3, // Header2: "C" | 0x80
41 0xC4, // Header3: "D" | 0x80
42 'S', // Header4: VCDIFF/SDCH, extensions used
43 0x00 }; // Hdr_Indicator:
44 // No compression, no custom code table
45
46 // VCDiffCodeTableWriter members and methods
47
48 // If interleaved is true, the encoder writes each delta file window
49 // by interleaving instructions and sizes with their corresponding
50 // addresses and data, rather than placing these elements into three
51 // separate sections. This facilitates providing partially
52 // decoded results when only a portion of a delta file window
53 // is received (e.g. when HTTP over TCP is used as the
54 // transmission protocol.) The interleaved format is
55 // not consistent with the VCDIFF draft standard.
56 //
VCDiffCodeTableWriter(bool interleaved)57 VCDiffCodeTableWriter::VCDiffCodeTableWriter(bool interleaved)
58 : max_mode_(VCDiffAddressCache::DefaultLastMode()),
59 dictionary_size_(0),
60 target_length_(0),
61 code_table_data_(&VCDiffCodeTableData::kDefaultCodeTableData),
62 instruction_map_(NULL),
63 last_opcode_index_(-1),
64 add_checksum_(false),
65 checksum_(0) {
66 InitSectionPointers(interleaved);
67 }
68
VCDiffCodeTableWriter(bool interleaved,int near_cache_size,int same_cache_size,const VCDiffCodeTableData & code_table_data,unsigned char max_mode)69 VCDiffCodeTableWriter::VCDiffCodeTableWriter(
70 bool interleaved,
71 int near_cache_size,
72 int same_cache_size,
73 const VCDiffCodeTableData& code_table_data,
74 unsigned char max_mode)
75 : max_mode_(max_mode),
76 address_cache_(near_cache_size, same_cache_size),
77 dictionary_size_(0),
78 target_length_(0),
79 code_table_data_(&code_table_data),
80 instruction_map_(NULL),
81 last_opcode_index_(-1),
82 add_checksum_(false),
83 checksum_(0) {
84 InitSectionPointers(interleaved);
85 }
86
~VCDiffCodeTableWriter()87 VCDiffCodeTableWriter::~VCDiffCodeTableWriter() {
88 if (code_table_data_ != &VCDiffCodeTableData::kDefaultCodeTableData) {
89 delete instruction_map_;
90 }
91 }
92
InitSectionPointers(bool interleaved)93 void VCDiffCodeTableWriter::InitSectionPointers(bool interleaved) {
94 if (interleaved) {
95 data_for_add_and_run_ = &instructions_and_sizes_;
96 addresses_for_copy_ = &instructions_and_sizes_;
97 } else {
98 data_for_add_and_run_ = &separate_data_for_add_and_run_;
99 addresses_for_copy_ = &separate_addresses_for_copy_;
100 }
101 }
102
Init(size_t dictionary_size)103 bool VCDiffCodeTableWriter::Init(size_t dictionary_size) {
104 dictionary_size_ = dictionary_size;
105 if (!instruction_map_) {
106 if (code_table_data_ == &VCDiffCodeTableData::kDefaultCodeTableData) {
107 instruction_map_ = VCDiffInstructionMap::GetDefaultInstructionMap();
108 } else {
109 instruction_map_ = new VCDiffInstructionMap(*code_table_data_, max_mode_);
110 }
111 if (!instruction_map_) {
112 return false;
113 }
114 }
115 if (!address_cache_.Init()) {
116 return false;
117 }
118 target_length_ = 0;
119 last_opcode_index_ = -1;
120 return true;
121 }
122
WriteHeader(OutputStringInterface * out,VCDiffFormatExtensionFlags format_extensions)123 void VCDiffCodeTableWriter::WriteHeader(
124 OutputStringInterface* out,
125 VCDiffFormatExtensionFlags format_extensions) {
126 if (format_extensions == VCD_STANDARD_FORMAT) {
127 out->append(reinterpret_cast<const char*>(&kHeaderStandardFormat),
128 sizeof(kHeaderStandardFormat));
129 } else {
130 out->append(reinterpret_cast<const char*>(&kHeaderExtendedFormat),
131 sizeof(kHeaderExtendedFormat));
132 }
133 // If custom cache table sizes or a custom code table were used
134 // for encoding, here is where they would be appended to *output.
135 // This implementation of the encoder does not use those features,
136 // although the decoder can understand and interpret them.
137 }
138
139 // The VCDiff format allows each opcode to represent either
140 // one or two delta instructions. This function will first
141 // examine the opcode generated by the last call to EncodeInstruction.
142 // If that opcode was a single-instruction opcode, this function checks
143 // whether there is a compound (double-instruction) opcode that can
144 // combine that single instruction with the instruction that is now
145 // being added, and so save a byte of space. In that case, the
146 // single-instruction opcode at position last_opcode_index_ will be
147 // overwritten with the new double-instruction opcode.
148 //
149 // In the majority of cases, no compound opcode will be possible,
150 // and a new single-instruction opcode will be appended to
151 // instructions_and_sizes_, followed by a representation of its size
152 // if the opcode does not implicitly give the instruction size.
153 //
154 // As an example, say instructions_and_sizes_ contains 10 bytes, the last
155 // of which contains the opcode 0x02 (ADD size 1). Because that was the
156 // most recently added opcode, last_opcode_index_ has the value 10.
157 // EncodeInstruction is then called with inst = VCD_COPY, size = 4, mode = 0.
158 // The function will replace the old opcode 0x02 with the double-instruction
159 // opcode 0xA3 (ADD size 1 + COPY size 4 mode 0).
160 //
161 // All of the double-instruction opcodes in the standard code table
162 // have implicit sizes, meaning that the size of the instruction will not
163 // need to be written to the instructions_and_sizes_ string separately
164 // from the opcode. If a custom code table were used that did not have
165 // this property, then instructions_and_sizes_ might contain a
166 // double-instruction opcode (say, COPY size 0 mode 0 + ADD size 0)
167 // followed by the size of the COPY, then by the size of the ADD.
168 // If using the SDCH interleaved format, the address of the COPY instruction
169 // would follow its size, so the ordering would be
170 // [Compound Opcode][Size of COPY][Address of COPY][Size of ADD]
171 //
EncodeInstruction(VCDiffInstructionType inst,size_t size,unsigned char mode)172 void VCDiffCodeTableWriter::EncodeInstruction(VCDiffInstructionType inst,
173 size_t size,
174 unsigned char mode) {
175 if (!instruction_map_) {
176 VCD_DFATAL << "EncodeInstruction() called without calling Init()"
177 << VCD_ENDL;
178 return;
179 }
180 if (last_opcode_index_ >= 0) {
181 const unsigned char last_opcode =
182 instructions_and_sizes_[last_opcode_index_];
183 // The encoding engine should not generate two ADD instructions in a row.
184 // This won't cause a failure, but it's inefficient encoding and probably
185 // represents a bug in the higher-level logic of the encoder.
186 if ((inst == VCD_ADD) &&
187 (code_table_data_->inst1[last_opcode] == VCD_ADD)) {
188 VCD_WARNING << "EncodeInstruction() called for two ADD instructions"
189 " in a row" << VCD_ENDL;
190 }
191 OpcodeOrNone compound_opcode = kNoOpcode;
192 if (size <= UCHAR_MAX) {
193 compound_opcode =
194 instruction_map_->LookupSecondOpcode(last_opcode,
195 inst,
196 static_cast<unsigned char>(size),
197 mode);
198 if (compound_opcode != kNoOpcode) {
199 instructions_and_sizes_[last_opcode_index_] =
200 static_cast<unsigned char>(compound_opcode);
201 last_opcode_index_ = -1;
202 return;
203 }
204 }
205 // Try finding a compound opcode with size 0.
206 compound_opcode = instruction_map_->LookupSecondOpcode(last_opcode,
207 inst,
208 0,
209 mode);
210 if (compound_opcode != kNoOpcode) {
211 instructions_and_sizes_[last_opcode_index_] =
212 static_cast<unsigned char>(compound_opcode);
213 last_opcode_index_ = -1;
214 AppendSizeToString(size, &instructions_and_sizes_);
215 return;
216 }
217 }
218 OpcodeOrNone opcode = kNoOpcode;
219 if (size <= UCHAR_MAX) {
220 opcode =
221 instruction_map_->LookupFirstOpcode(inst,
222 static_cast<unsigned char>(size),
223 mode);
224 if (opcode != kNoOpcode) {
225 instructions_and_sizes_.push_back(static_cast<char>(opcode));
226 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
227 return;
228 }
229 }
230 // There should always be an opcode with size 0.
231 opcode = instruction_map_->LookupFirstOpcode(inst, 0, mode);
232 if (opcode == kNoOpcode) {
233 VCD_DFATAL << "No matching opcode found for inst " << inst
234 << ", mode " << mode << ", size 0" << VCD_ENDL;
235 return;
236 }
237 instructions_and_sizes_.push_back(static_cast<char>(opcode));
238 last_opcode_index_ = static_cast<int>(instructions_and_sizes_.size() - 1);
239 AppendSizeToString(size, &instructions_and_sizes_);
240 }
241
Add(const char * data,size_t size)242 void VCDiffCodeTableWriter::Add(const char* data, size_t size) {
243 EncodeInstruction(VCD_ADD, size);
244 data_for_add_and_run_->append(data, size);
245 target_length_ += size;
246 }
247
Copy(int32_t offset,size_t size)248 void VCDiffCodeTableWriter::Copy(int32_t offset, size_t size) {
249 if (!instruction_map_) {
250 VCD_DFATAL << "VCDiffCodeTableWriter::Copy() called without calling Init()"
251 << VCD_ENDL;
252 return;
253 }
254 // If a single interleaved stream of encoded values is used
255 // instead of separate sections for instructions, addresses, and data,
256 // then the string instructions_and_sizes_ may be the same as
257 // addresses_for_copy_. The address should therefore be encoded
258 // *after* the instruction and its size.
259 int32_t encoded_addr = 0;
260 const unsigned char mode = address_cache_.EncodeAddress(
261 offset,
262 static_cast<VCDAddress>(dictionary_size_ + target_length_),
263 &encoded_addr);
264 EncodeInstruction(VCD_COPY, size, mode);
265 if (address_cache_.WriteAddressAsVarintForMode(mode)) {
266 VarintBE<int32_t>::AppendToString(encoded_addr, addresses_for_copy_);
267 } else {
268 addresses_for_copy_->push_back(static_cast<unsigned char>(encoded_addr));
269 }
270 target_length_ += size;
271 }
272
Run(size_t size,unsigned char byte)273 void VCDiffCodeTableWriter::Run(size_t size, unsigned char byte) {
274 EncodeInstruction(VCD_RUN, size);
275 data_for_add_and_run_->push_back(byte);
276 target_length_ += size;
277 }
278
CalculateLengthOfSizeAsVarint(size_t size)279 size_t VCDiffCodeTableWriter::CalculateLengthOfSizeAsVarint(size_t size) {
280 return VarintBE<int32_t>::Length(static_cast<int32_t>(size));
281 }
282
AppendSizeToString(size_t size,string * out)283 void VCDiffCodeTableWriter::AppendSizeToString(size_t size, string* out) {
284 VarintBE<int32_t>::AppendToString(static_cast<int32_t>(size), out);
285 }
286
AppendSizeToOutputString(size_t size,OutputStringInterface * out)287 void VCDiffCodeTableWriter::AppendSizeToOutputString(
288 size_t size,
289 OutputStringInterface* out) {
290 VarintBE<int32_t>::AppendToOutputString(static_cast<int32_t>(size), out);
291 }
292
293 // This calculation must match the items added between "Start of Delta Encoding"
294 // and "End of Delta Encoding" in Output(), below.
CalculateLengthOfTheDeltaEncoding() const295 size_t VCDiffCodeTableWriter::CalculateLengthOfTheDeltaEncoding() const {
296 size_t length_of_the_delta_encoding =
297 CalculateLengthOfSizeAsVarint(target_length_) +
298 1 + // Delta_Indicator
299 CalculateLengthOfSizeAsVarint(separate_data_for_add_and_run_.size()) +
300 CalculateLengthOfSizeAsVarint(instructions_and_sizes_.size()) +
301 CalculateLengthOfSizeAsVarint(separate_addresses_for_copy_.size()) +
302 separate_data_for_add_and_run_.size() +
303 instructions_and_sizes_.size() +
304 separate_addresses_for_copy_.size();
305 if (add_checksum_) {
306 length_of_the_delta_encoding +=
307 VarintBE<int64_t>::Length(static_cast<int64_t>(checksum_));
308 }
309 return length_of_the_delta_encoding;
310 }
311
Output(OutputStringInterface * out)312 void VCDiffCodeTableWriter::Output(OutputStringInterface* out) {
313 if (instructions_and_sizes_.empty()) {
314 VCD_WARNING << "Empty input; no delta window produced" << VCD_ENDL;
315 } else {
316 const size_t length_of_the_delta_encoding =
317 CalculateLengthOfTheDeltaEncoding();
318 const size_t delta_window_size =
319 length_of_the_delta_encoding +
320 1 + // Win_Indicator
321 CalculateLengthOfSizeAsVarint(dictionary_size_) +
322 CalculateLengthOfSizeAsVarint(0) +
323 CalculateLengthOfSizeAsVarint(length_of_the_delta_encoding);
324 // append() will be called many times on the output string; make sure
325 // the output string is resized only once at most.
326 out->ReserveAdditionalBytes(delta_window_size);
327
328 // Add first element: Win_Indicator
329 if (add_checksum_) {
330 out->push_back(VCD_SOURCE | VCD_CHECKSUM);
331 } else {
332 out->push_back(VCD_SOURCE);
333 }
334 // Source segment size: dictionary size
335 AppendSizeToOutputString(dictionary_size_, out);
336 // Source segment position: 0 (start of dictionary)
337 AppendSizeToOutputString(0, out);
338
339 // [Here is where a secondary compressor would be used
340 // if the encoder and decoder supported that feature.]
341
342 AppendSizeToOutputString(length_of_the_delta_encoding, out);
343 // Start of Delta Encoding
344 const size_t size_before_delta_encoding = out->size();
345 AppendSizeToOutputString(target_length_, out);
346 out->push_back(0x00); // Delta_Indicator: no compression
347 AppendSizeToOutputString(separate_data_for_add_and_run_.size(), out);
348 AppendSizeToOutputString(instructions_and_sizes_.size(), out);
349 AppendSizeToOutputString(separate_addresses_for_copy_.size(), out);
350 if (add_checksum_) {
351 // The checksum is a 32-bit *unsigned* integer. VarintBE requires a
352 // signed type, so use a 64-bit signed integer to store the checksum.
353 VarintBE<int64_t>::AppendToOutputString(static_cast<int64_t>(checksum_),
354 out);
355 }
356 out->append(separate_data_for_add_and_run_.data(),
357 separate_data_for_add_and_run_.size());
358 out->append(instructions_and_sizes_.data(),
359 instructions_and_sizes_.size());
360 out->append(separate_addresses_for_copy_.data(),
361 separate_addresses_for_copy_.size());
362 // End of Delta Encoding
363 const size_t size_after_delta_encoding = out->size();
364 if (length_of_the_delta_encoding !=
365 (size_after_delta_encoding - size_before_delta_encoding)) {
366 VCD_DFATAL << "Internal error: calculated length of the delta encoding ("
367 << length_of_the_delta_encoding
368 << ") does not match actual length ("
369 << (size_after_delta_encoding - size_before_delta_encoding)
370 << VCD_ENDL;
371 }
372 separate_data_for_add_and_run_.clear();
373 instructions_and_sizes_.clear();
374 separate_addresses_for_copy_.clear();
375 if (target_length_ == 0) {
376 VCD_WARNING << "Empty target window" << VCD_ENDL;
377 }
378 }
379
380 // Reset state for next window; assume we are using same code table
381 // and dictionary. The caller will have to invoke Init() if a different
382 // dictionary is used.
383 //
384 // Notably, Init() calls address_cache_.Init(). This resets the address
385 // cache between delta windows, as required by RFC section 5.1.
386 if (!Init(dictionary_size_)) {
387 VCD_DFATAL << "Internal error: calling Init() to reset "
388 "VCDiffCodeTableWriter state failed" << VCD_ENDL;
389 }
390 }
391
392 }; // namespace open_vcdiff
393