// Copyright 2008 Google Inc. // Author: Lincoln Smith // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Implements a Decoder for the format described in // RFC 3284 - The VCDIFF Generic Differencing and Compression Data Format. // The RFC text can be found at http://www.faqs.org/rfcs/rfc3284.html // // The RFC describes the possibility of using a secondary compressor // to further reduce the size of each section of the VCDIFF output. // That feature is not supported in this implementation of the encoder // and decoder. // No secondary compressor types have been publicly registered with // the IANA at http://www.iana.org/assignments/vcdiff-comp-ids // in the more than five years since the registry was created, so there // is no standard set of compressor IDs which would be generated by other // encoders or accepted by other decoders. #include #include "google/vcdecoder.h" #include // size_t, ptrdiff_t #include // int32_t #include // memcpy, memset #include // auto_ptr #include #include "addrcache.h" #include "checksum.h" #include "codetable.h" #include "decodetable.h" #include "headerparser.h" #include "logging.h" #include "google/output_string.h" #include "varint_bigendian.h" #include "vcdiff_defs.h" namespace open_vcdiff { // This class is used to parse delta file windows as described // in RFC sections 4.2 and 4.3. Its methods are not thread-safe. // // Here is the window format copied from the RFC: // // Window1 // Win_Indicator - byte // [Source segment size] - integer // [Source segment position] - integer // The delta encoding of the target window // Length of the delta encoding - integer // The delta encoding // Size of the target window - integer // Delta_Indicator - byte // Length of data for ADDs and RUNs - integer // Length of instructions and sizes - integer // Length of addresses for COPYs - integer // Data section for ADDs and RUNs - array of bytes // Instructions and sizes section - array of bytes // Addresses section for COPYs - array of bytes // Window2 // ... // // Sample usage: // // VCDiffDeltaFileWindow delta_window_; // delta_window_.Init(parent); // ParseableChunk parseable_chunk(input_buffer, // input_size, // leftover_unencoded_bytes); // switch (delta_window_.DecodeWindows(&parseable_chunk)) { // case RESULT_END_OF_DATA: // // case RESULT_ERROR: // // } // // DecodeWindows consumes as many windows from the input as it can. It only // needs to be placed within a loop if the loop is used to obtain more input // (delta file) data. // class VCDiffDeltaFileWindow { public: VCDiffDeltaFileWindow(); ~VCDiffDeltaFileWindow(); // Init() should be called immediately after constructing the // VCDiffDeltaFileWindow(). It must be called before DecodeWindows() can be // invoked, or an error will occur. void Init(VCDiffStreamingDecoderImpl* parent); // Resets the pointers to the data sections in the current window. void Reset(); bool UseCodeTable(const VCDiffCodeTableData& code_table_data, unsigned char max_mode) { return reader_.UseCodeTable(code_table_data, max_mode); } // Decodes as many delta windows as possible using the input data from // *parseable_chunk. Appends the decoded target windows to // parent_->decoded_target(). Returns RESULT_SUCCESS on success, or // RESULT_END_OF_DATA if the end of input was reached before the entire window // could be decoded and more input is expected (only possible if // IsInterleaved() is true), or RESULT_ERROR if an error occurred during // decoding. In the RESULT_ERROR case, the value of parseable_chunk->pointer_ // is undefined; otherwise, parseable_chunk->Advance() is called to point to // the input data position just after the data that has been decoded. // // If planned_target_file_size is not set to kUnlimitedBytes, then the decoder // expects *exactly* this number of target bytes to be decoded from one or // more delta file windows. If this number is met exactly after finishing a // delta window, this function will return RESULT_SUCCESS without processing // any more bytes from data_pointer. If this number is exceeded while // decoding a window, but was not met before starting that window, // then RESULT_ERROR will be returned. // VCDiffResult DecodeWindows(ParseableChunk* parseable_chunk); bool FoundWindowHeader() const { return found_header_; } bool MoreDataExpected() const { // When parsing an interleaved-format delta file, // every time DecodeBody() exits, interleaved_bytes_expected_ // will be decremented by the number of bytes parsed. If it // reaches zero, then there is no more data expected because // the size of the interleaved section (given in the window // header) has been reached. return IsInterleaved() && (interleaved_bytes_expected_ > 0); } size_t target_window_start_pos() const { return target_window_start_pos_; } void set_target_window_start_pos(size_t new_start_pos) { target_window_start_pos_ = new_start_pos; } // Returns the number of bytes remaining to be decoded in the target window. // If not in the process of decoding a window, returns 0. size_t TargetBytesRemaining(); private: // Reads the header of the window section as described in RFC sections 4.2 and // 4.3, up to and including the value "Length of addresses for COPYs". If the // entire header is found, this function sets up the DeltaWindowSections // instructions_and_sizes_, data_for_add_and_run_, and addresses_for_copy_ so // that the decoder can begin decoding the opcodes in these sections. Returns // RESULT_ERROR if an error occurred, or RESULT_END_OF_DATA if the end of // available data was reached before the entire header could be read. (The // latter may be an error condition if there is no more data available.) // Otherwise, returns RESULT_SUCCESS and advances parseable_chunk past the // parsed header. // VCDiffResult ReadHeader(ParseableChunk* parseable_chunk); // After the window header has been parsed as far as the Delta_Indicator, // this function is called to parse the following delta window header fields: // // Length of data for ADDs and RUNs - integer (VarintBE format) // Length of instructions and sizes - integer (VarintBE format) // Length of addresses for COPYs - integer (VarintBE format) // // If has_checksum_ is true, it also looks for the following element: // // Adler32 checksum - unsigned 32-bit integer (VarintBE format) // // It sets up the DeltaWindowSections instructions_and_sizes_, // data_for_add_and_run_, and addresses_for_copy_. If the interleaved format // is being used, all three sections will include the entire window body; if // the standard format is used, three non-overlapping window sections will be // defined. Returns RESULT_ERROR if an error occurred, or RESULT_END_OF_DATA // if standard format is being used and there is not enough input data to read // the entire window body. Otherwise, returns RESULT_SUCCESS. VCDiffResult SetUpWindowSections(VCDiffHeaderParser* header_parser); // Decodes the body of the window section as described in RFC sections 4.3, // including the sections "Data section for ADDs and RUNs", "Instructions // and sizes section", and "Addresses section for COPYs". These sections // must already have been set up by ReadWindowHeader(). Returns a // non-negative value on success, or RESULT_END_OF_DATA if the end of input // was reached before the entire window could be decoded (only possible if // IsInterleaved() is true), or RESULT_ERROR if an error occurred during // decoding. Appends as much of the decoded target window as possible to // parent->decoded_target(). // int DecodeBody(ParseableChunk* parseable_chunk); // Returns the number of bytes already decoded into the target window. size_t TargetBytesDecoded(); // Decodes a single ADD instruction, updating parent_->decoded_target_. VCDiffResult DecodeAdd(size_t size); // Decodes a single RUN instruction, updating parent_->decoded_target_. VCDiffResult DecodeRun(size_t size); // Decodes a single COPY instruction, updating parent_->decoded_target_. VCDiffResult DecodeCopy(size_t size, unsigned char mode); // When using the interleaved format, this function is called both on parsing // the header and on resuming after a RESULT_END_OF_DATA was returned from a // previous call to DecodeBody(). It sets up all three section pointers to // reference the same interleaved stream of instructions, sizes, addresses, // and data. These pointers must be reset every time that work resumes on a // delta window, because the input data string may have been changed or // resized since DecodeBody() last returned. void UpdateInterleavedSectionPointers(const char* data_pos, const char* data_end) { const ptrdiff_t available_data = data_end - data_pos; // Don't read past the end of currently-available data if (available_data > interleaved_bytes_expected_) { instructions_and_sizes_.Init(data_pos, interleaved_bytes_expected_); } else { instructions_and_sizes_.Init(data_pos, available_data); } data_for_add_and_run_.Init(&instructions_and_sizes_); addresses_for_copy_.Init(&instructions_and_sizes_); } // If true, the interleaved format described in AllowInterleaved() is used // for the current delta file. Only valid after ReadWindowHeader() has been // called and returned a positive number (i.e., the whole header was parsed), // but before the window has finished decoding. // bool IsInterleaved() const { // If the sections are interleaved, both addresses_for_copy_ and // data_for_add_and_run_ should point at instructions_and_sizes_. return !addresses_for_copy_.IsOwned(); } // Executes a single COPY or ADD instruction, appending data to // parent_->decoded_target(). void CopyBytes(const char* data, size_t size); // Executes a single RUN instruction, appending data to // parent_->decoded_target(). void RunByte(unsigned char byte, size_t size); // Advance *parseable_chunk to point to the current position in the // instructions/sizes section. If interleaved format is used, then // decrement the number of expected bytes in the instructions/sizes section // by the number of instruction/size bytes parsed. void UpdateInstructionPointer(ParseableChunk* parseable_chunk); // The parent object which was passed to Init(). VCDiffStreamingDecoderImpl* parent_; // This value will be true if VCDiffDeltaFileWindow::ReadDeltaWindowHeader() // has been called and succeeded in parsing the delta window header, but the // entire window has not yet been decoded. bool found_header_; // Contents and length of the current source window. source_segment_ptr_ // will be non-NULL if (a) the window section header for the current window // has been read, but the window has not yet finished decoding; or // (b) the window did not specify a source segment. const char* source_segment_ptr_; size_t source_segment_length_; // The delta encoding window sections as defined in RFC section 4.3. // The pointer for each section will be incremented as data is consumed and // decoded from that section. If the interleaved format is used, // data_for_add_and_run_ and addresses_for_copy_ will both point to // instructions_and_sizes_; otherwise, they will be separate data sections. // DeltaWindowSection instructions_and_sizes_; DeltaWindowSection data_for_add_and_run_; DeltaWindowSection addresses_for_copy_; // The expected bytes left to decode in instructions_and_sizes_. Only used // for the interleaved format. int interleaved_bytes_expected_; // The expected length of the target window once it has been decoded. size_t target_window_length_; // The index in decoded_target at which the first byte of the current // target window was/will be written. size_t target_window_start_pos_; // If has_checksum_ is true, then expected_checksum_ contains an Adler32 // checksum of the target window data. This is an extension included in the // VCDIFF 'S' (SDCH) format, but is not part of the RFC 3284 draft standard. bool has_checksum_; VCDChecksum expected_checksum_; VCDiffCodeTableReader reader_; // Making these private avoids implicit copy constructor & assignment operator VCDiffDeltaFileWindow(const VCDiffDeltaFileWindow&); // NOLINT void operator=(const VCDiffDeltaFileWindow&); }; // *** Inline methods for VCDiffDeltaFileWindow inline VCDiffDeltaFileWindow::VCDiffDeltaFileWindow() : parent_(NULL) { Reset(); } inline VCDiffDeltaFileWindow::~VCDiffDeltaFileWindow() { } inline void VCDiffDeltaFileWindow::Init(VCDiffStreamingDecoderImpl* parent) { parent_ = parent; } class VCDiffStreamingDecoderImpl { public: typedef std::string string; // The default maximum target file size (and target window size) if // SetMaximumTargetFileSize() is not called. static const size_t kDefaultMaximumTargetFileSize = 67108864U; // 64 MB // The largest value that can be passed to SetMaximumTargetWindowSize(). // Using a larger value will result in an error. static const size_t kTargetSizeLimit = 2147483647U; // INT32_MAX // A constant that is the default value for planned_target_file_size_, // indicating that the decoder does not have an expected length // for the target data. static const size_t kUnlimitedBytes = static_cast(-3); VCDiffStreamingDecoderImpl(); ~VCDiffStreamingDecoderImpl(); // Resets all member variables to their initial states. void Reset(); // These functions are identical to their counterparts // in VCDiffStreamingDecoder. // void StartDecoding(const char* dictionary_ptr, size_t dictionary_size); bool DecodeChunk(const char* data, size_t len, OutputStringInterface* output_string); bool FinishDecoding(); // If true, the version of VCDIFF used in the current delta file allows // for the interleaved format, in which instructions, addresses and data // are all sent interleaved in the instructions section of each window // rather than being sent in separate sections. This is not part of // the VCDIFF draft standard, so we've defined a special version code // 'S' which implies that this feature is available. Even if interleaving // is supported, it is not mandatory; interleaved format will be implied // if the address and data sections are both zero-length. // bool AllowInterleaved() const { return vcdiff_version_code_ == 'S'; } // If true, the version of VCDIFF used in the current delta file allows // each delta window to contain an Adler32 checksum of the target window data. // If the bit 0x08 (VCD_CHECKSUM) is set in the Win_Indicator flags, then // this checksum will appear as a variable-length integer, just after the // "length of addresses for COPYs" value and before the window data sections. // It is possible for some windows in a delta file to use the checksum feature // and for others not to use it (and leave the flag bit set to 0.) // Just as with AllowInterleaved(), this extension is not part of the draft // standard and is only available when the version code 'S' is specified. // bool AllowChecksum() const { return vcdiff_version_code_ == 'S'; } bool SetMaximumTargetFileSize(size_t new_maximum_target_file_size) { maximum_target_file_size_ = new_maximum_target_file_size; return true; } bool SetMaximumTargetWindowSize(size_t new_maximum_target_window_size) { if (new_maximum_target_window_size > kTargetSizeLimit) { LOG(ERROR) << "Specified maximum target window size " << new_maximum_target_window_size << " exceeds limit of " << kTargetSizeLimit << " bytes" << LOG_ENDL; return false; } maximum_target_window_size_ = new_maximum_target_window_size; return true; } // See description of planned_target_file_size_, below. bool HasPlannedTargetFileSize() const { return planned_target_file_size_ != kUnlimitedBytes; } void SetPlannedTargetFileSize(size_t planned_target_file_size) { planned_target_file_size_ = planned_target_file_size; } void AddToTotalTargetWindowSize(size_t window_size) { total_of_target_window_sizes_ += window_size; } // Checks to see whether the decoded target data has reached its planned size. bool ReachedPlannedTargetFileSize() const { if (!HasPlannedTargetFileSize()) { return false; } // The planned target file size should not have been exceeded. // TargetWindowWouldExceedSizeLimits() ensures that the advertised size of // each target window would not make the target file exceed that limit, and // DecodeBody() will return RESULT_ERROR if the actual decoded output ever // exceeds the advertised target window size. if (total_of_target_window_sizes_ > planned_target_file_size_) { LOG(DFATAL) << "Internal error: Decoded data size " << total_of_target_window_sizes_ << " exceeds planned target file size " << planned_target_file_size_ << LOG_ENDL; return true; } return total_of_target_window_sizes_ == planned_target_file_size_; } // Checks to see whether adding a new target window of the specified size // would exceed the planned target file size, the maximum target file size, // or the maximum target window size. If so, logs an error and returns true; // otherwise, returns false. bool TargetWindowWouldExceedSizeLimits(size_t window_size) const; // Returns the amount of input data passed to the last DecodeChunk() // that was not consumed by the decoder. This is essential if // SetPlannedTargetFileSize() is being used, in order to preserve the // remaining input data stream once the planned target file has been decoded. size_t GetUnconsumedDataSize() const { return unparsed_bytes_.size(); } // This function will return true if the decoder has parsed a complete delta // file header plus zero or more delta file windows, with no data left over. // It will also return true if no delta data at all was decoded. If these // conditions are not met, then FinishDecoding() should not be called. bool IsDecodingComplete() const { if (!FoundFileHeader()) { // No complete delta file header has been parsed yet. DecodeChunk() // may have received some data that it hasn't yet parsed, in which case // decoding is incomplete. return unparsed_bytes_.empty(); } else if (custom_code_table_decoder_.get()) { // The decoder is in the middle of parsing a custom code table. return false; } else if (delta_window_.FoundWindowHeader()) { // The decoder is in the middle of parsing an interleaved format delta // window. return false; } else if (ReachedPlannedTargetFileSize()) { // The decoder found exactly the planned number of bytes. In this case // it is OK for unparsed_bytes_ to be non-empty; it contains the leftover // data after the end of the delta file. return true; } else { // No complete delta file window has been parsed yet. DecodeChunk() // may have received some data that it hasn't yet parsed, in which case // decoding is incomplete. return unparsed_bytes_.empty(); } } const char* dictionary_ptr() const { return dictionary_ptr_; } size_t dictionary_size() const { return dictionary_size_; } VCDiffAddressCache* addr_cache() { return addr_cache_.get(); } string* decoded_target() { return &decoded_target_; } bool allow_vcd_target() const { return allow_vcd_target_; } void SetAllowVcdTarget(bool allow_vcd_target) { if (start_decoding_was_called_) { LOG(DFATAL) << "SetAllowVcdTarget() called after StartDecoding()" << LOG_ENDL; return; } allow_vcd_target_ = allow_vcd_target; } // Removes the contents of decoded_target_ that precede the beginning of the // current window. void TruncateToBeginningOfWindow(); private: // Reads the VCDiff delta file header section as described in RFC section 4.1, // except the custom code table data. Returns RESULT_ERROR if an error // occurred, or RESULT_END_OF_DATA if the end of available data was reached // before the entire header could be read. (The latter may be an error // condition if there is no more data available.) Otherwise, advances // data->position_ past the header and returns RESULT_SUCCESS. // VCDiffResult ReadDeltaFileHeader(ParseableChunk* data); // Indicates whether or not the header has already been read. bool FoundFileHeader() const { return addr_cache_.get() != NULL; } // If ReadDeltaFileHeader() finds the VCD_CODETABLE flag set within the delta // file header, this function parses the custom cache sizes and initializes // a nested VCDiffStreamingDecoderImpl object that will be used to parse the // custom code table in ReadCustomCodeTable(). Returns RESULT_ERROR if an // error occurred, or RESULT_END_OF_DATA if the end of available data was // reached before the custom cache sizes could be read. Otherwise, returns // the number of bytes read. // int InitCustomCodeTable(const char* data_start, const char* data_end); // If a custom code table was specified in the header section that was parsed // by ReadDeltaFileHeader(), this function makes a recursive call to another // VCDiffStreamingDecoderImpl object (custom_code_table_decoder_), since the // custom code table is expected to be supplied as an embedded VCDIFF // encoding that uses the standard code table. Returns RESULT_ERROR if an // error occurs, or RESULT_END_OF_DATA if the end of available data was // reached before the entire custom code table could be read. Otherwise, // returns RESULT_SUCCESS and sets *data_ptr to the position after the encoded // custom code table. If the function returns RESULT_SUCCESS or // RESULT_END_OF_DATA, it advances data->position_ past the parsed bytes. // VCDiffResult ReadCustomCodeTable(ParseableChunk* data); // Contents and length of the source (dictionary) data. const char* dictionary_ptr_; size_t dictionary_size_; // This string will be used to store any unparsed bytes left over when // DecodeChunk() reaches the end of its input and returns RESULT_END_OF_DATA. // It will also be used to concatenate those unparsed bytes with the data // supplied to the next call to DecodeChunk(), so that they appear in // contiguous memory. string unparsed_bytes_; // The portion of the target file that has been decoded so far. This will be // used to fill the output string for DecodeChunk(), and will also be used to // execute COPY instructions that reference target data. Since the source // window can come from a range of addresses in the previously decoded target // data, the entire target file needs to be available to the decoder, not just // the current target window. string decoded_target_; // The VCDIFF version byte (also known as "header4") from the // delta file header. unsigned char vcdiff_version_code_; VCDiffDeltaFileWindow delta_window_; std::auto_ptr addr_cache_; // Will be NULL unless a custom code table has been defined. std::auto_ptr custom_code_table_; // Used to receive the decoded custom code table. string custom_code_table_string_; // If a custom code table is specified, it will be expressed // as an embedded VCDIFF delta file which uses the default code table // as the source file (dictionary). Use a child decoder object // to decode that delta file. std::auto_ptr custom_code_table_decoder_; // If set, then the decoder is expecting *exactly* this number of // target bytes to be decoded from one or more delta file windows. // If this number is exceeded while decoding a window, but was not met // before starting on that window, an error will be reported. // If FinishDecoding() is called before this number is met, an error // will also be reported. This feature is used for decoding the // embedded code table data within a VCDIFF delta file; we want to // stop processing the embedded data once the entire code table has // been decoded, and treat the rest of the available data as part // of the enclosing delta file. size_t planned_target_file_size_; size_t maximum_target_file_size_; size_t maximum_target_window_size_; // Contains the sum of the decoded sizes of all target windows seen so far, // including the expected total size of the current target window in progress // (even if some of the current target window has not yet been decoded.) size_t total_of_target_window_sizes_; // This value is used to ensure the correct order of calls to the interface // functions, i.e., a single call to StartDecoding(), followed by zero or // more calls to DecodeChunk(), followed by a single call to // FinishDecoding(). bool start_decoding_was_called_; // If this value is true then the VCD_TARGET flag can be specified to allow // the source segment to be chosen from the previously-decoded target data. // (This is the default behavior.) If it is false, then specifying the // VCD_TARGET flag is considered an error, and the decoder does not need to // keep in memory any decoded target data prior to the current window. bool allow_vcd_target_; // Making these private avoids implicit copy constructor & assignment operator VCDiffStreamingDecoderImpl(const VCDiffStreamingDecoderImpl&); // NOLINT void operator=(const VCDiffStreamingDecoderImpl&); }; // *** Methods for VCDiffStreamingDecoderImpl const size_t VCDiffStreamingDecoderImpl::kDefaultMaximumTargetFileSize; const size_t VCDiffStreamingDecoderImpl::kUnlimitedBytes; VCDiffStreamingDecoderImpl::VCDiffStreamingDecoderImpl() : maximum_target_file_size_(kDefaultMaximumTargetFileSize), maximum_target_window_size_(kDefaultMaximumTargetFileSize), allow_vcd_target_(true) { delta_window_.Init(this); Reset(); } // Reset() will delete the component objects without reallocating them. VCDiffStreamingDecoderImpl::~VCDiffStreamingDecoderImpl() { Reset(); } void VCDiffStreamingDecoderImpl::Reset() { start_decoding_was_called_ = false; dictionary_ptr_ = NULL; dictionary_size_ = 0; vcdiff_version_code_ = '\0'; planned_target_file_size_ = kUnlimitedBytes; total_of_target_window_sizes_ = 0; addr_cache_.reset(); custom_code_table_.reset(); custom_code_table_decoder_.reset(); delta_window_.Reset(); } void VCDiffStreamingDecoderImpl::TruncateToBeginningOfWindow() { // Conserve the data for the current window that has been partially decoded. decoded_target_.erase(0, delta_window_.target_window_start_pos()); delta_window_.set_target_window_start_pos(0); } void VCDiffStreamingDecoderImpl::StartDecoding(const char* dictionary_ptr, size_t dictionary_size) { if (start_decoding_was_called_) { LOG(DFATAL) << "StartDecoding() called twice without FinishDecoding()" << LOG_ENDL; return; } unparsed_bytes_.clear(); decoded_target_.clear(); // delta_window_.Reset() depends on this Reset(); dictionary_ptr_ = dictionary_ptr; dictionary_size_ = dictionary_size; start_decoding_was_called_ = true; } // Reads the VCDiff delta file header section as described in RFC section 4.1: // // Header1 - byte = 0xD6 (ASCII 'V' | 0x80) // Header2 - byte = 0xC3 (ASCII 'C' | 0x80) // Header3 - byte = 0xC4 (ASCII 'D' | 0x80) // Header4 - byte // Hdr_Indicator - byte // [Secondary compressor ID] - byte // [Length of code table data] - integer // [Code table data] // // Initializes the code table and address cache objects. Returns RESULT_ERROR // if an error occurred, and RESULT_END_OF_DATA if the end of available data was // reached before the entire header could be read. (The latter may be an error // condition if there is no more data available.) Otherwise, returns // RESULT_SUCCESS, and removes the header bytes from the data string. // // It's relatively inefficient to expect this function to parse any number of // input bytes available, down to 1 byte, but it is necessary in case the input // is not a properly formatted VCDIFF delta file. If the entire input consists // of two bytes "12", then we should recognize that it does not match the // initial VCDIFF magic number "VCD" and report an error, rather than waiting // indefinitely for more input that will never arrive. // VCDiffResult VCDiffStreamingDecoderImpl::ReadDeltaFileHeader( ParseableChunk* data) { if (FoundFileHeader()) { return RESULT_SUCCESS; } size_t data_size = data->UnparsedSize(); const DeltaFileHeader* header = reinterpret_cast(data->UnparsedData()); bool wrong_magic_number = false; switch (data_size) { // Verify only the bytes that are available. default: // Found header contents up to and including VCDIFF version vcdiff_version_code_ = header->header4; if ((vcdiff_version_code_ != 0x00) && // Draft standard VCDIFF (RFC 3284) (vcdiff_version_code_ != 'S')) { // Enhancements for SDCH protocol LOG(ERROR) << "Unrecognized VCDIFF format version" << LOG_ENDL; return RESULT_ERROR; } // fall through case 3: if (header->header3 != 0xC4) { // magic value 'D' | 0x80 wrong_magic_number = true; } // fall through case 2: if (header->header2 != 0xC3) { // magic value 'C' | 0x80 wrong_magic_number = true; } // fall through case 1: if (header->header1 != 0xD6) { // magic value 'V' | 0x80 wrong_magic_number = true; } // fall through case 0: if (wrong_magic_number) { LOG(ERROR) << "Did not find VCDIFF header bytes; " "input is not a VCDIFF delta file" << LOG_ENDL; return RESULT_ERROR; } if (data_size < sizeof(DeltaFileHeader)) return RESULT_END_OF_DATA; } // Secondary compressor not supported. if (header->hdr_indicator & VCD_DECOMPRESS) { LOG(ERROR) << "Secondary compression is not supported" << LOG_ENDL; return RESULT_ERROR; } if (header->hdr_indicator & VCD_CODETABLE) { int bytes_parsed = InitCustomCodeTable( data->UnparsedData() + sizeof(DeltaFileHeader), data->End()); switch (bytes_parsed) { case RESULT_ERROR: return RESULT_ERROR; case RESULT_END_OF_DATA: return RESULT_END_OF_DATA; default: data->Advance(sizeof(DeltaFileHeader) + bytes_parsed); } } else { addr_cache_.reset(new VCDiffAddressCache); // addr_cache_->Init() will be called // from VCDiffStreamingDecoderImpl::DecodeChunk() data->Advance(sizeof(DeltaFileHeader)); } return RESULT_SUCCESS; } int VCDiffStreamingDecoderImpl::InitCustomCodeTable(const char* data_start, const char* data_end) { // A custom code table is being specified. Parse the variable-length // cache sizes and begin parsing the encoded custom code table. int32_t near_cache_size = 0, same_cache_size = 0; VCDiffHeaderParser header_parser(data_start, data_end); if (!header_parser.ParseInt32("size of near cache", &near_cache_size)) { return header_parser.GetResult(); } if (!header_parser.ParseInt32("size of same cache", &same_cache_size)) { return header_parser.GetResult(); } custom_code_table_.reset(new struct VCDiffCodeTableData); memset(custom_code_table_.get(), 0, sizeof(struct VCDiffCodeTableData)); custom_code_table_string_.clear(); addr_cache_.reset(new VCDiffAddressCache(near_cache_size, same_cache_size)); // addr_cache_->Init() will be called // from VCDiffStreamingDecoderImpl::DecodeChunk() // If we reach this point (the start of the custom code table) // without encountering a RESULT_END_OF_DATA condition, then we won't call // ReadDeltaFileHeader() again for this delta file. // // Instantiate a recursive decoder to interpret the custom code table // as a VCDIFF encoding of the default code table. custom_code_table_decoder_.reset(new VCDiffStreamingDecoderImpl); custom_code_table_decoder_->StartDecoding( reinterpret_cast( &VCDiffCodeTableData::kDefaultCodeTableData), sizeof(VCDiffCodeTableData::kDefaultCodeTableData)); custom_code_table_decoder_->SetPlannedTargetFileSize( sizeof(*custom_code_table_)); return static_cast(header_parser.ParsedSize()); } VCDiffResult VCDiffStreamingDecoderImpl::ReadCustomCodeTable( ParseableChunk* data) { if (!custom_code_table_decoder_.get()) { return RESULT_SUCCESS; } if (!custom_code_table_.get()) { LOG(DFATAL) << "Internal error: custom_code_table_decoder_ is set," " but custom_code_table_ is NULL" << LOG_ENDL; return RESULT_ERROR; } OutputString output_string(&custom_code_table_string_); if (!custom_code_table_decoder_->DecodeChunk(data->UnparsedData(), data->UnparsedSize(), &output_string)) { return RESULT_ERROR; } if (custom_code_table_string_.length() < sizeof(*custom_code_table_)) { // Skip over the consumed data. data->Finish(); return RESULT_END_OF_DATA; } if (!custom_code_table_decoder_->FinishDecoding()) { return RESULT_ERROR; } if (custom_code_table_string_.length() != sizeof(*custom_code_table_)) { LOG(DFATAL) << "Decoded custom code table size (" << custom_code_table_string_.length() << ") does not match size of a code table (" << sizeof(*custom_code_table_) << ")" << LOG_ENDL; return RESULT_ERROR; } memcpy(custom_code_table_.get(), custom_code_table_string_.data(), sizeof(*custom_code_table_)); custom_code_table_string_.clear(); // Skip over the consumed data. data->FinishExcept(custom_code_table_decoder_->GetUnconsumedDataSize()); custom_code_table_decoder_.reset(); delta_window_.UseCodeTable(*custom_code_table_, addr_cache_->LastMode()); return RESULT_SUCCESS; } namespace { class TrackNewOutputText { public: typedef std::string string; explicit TrackNewOutputText(const string& decoded_target) : decoded_target_(decoded_target), initial_decoded_target_size_(decoded_target.size()) { } void AppendNewOutputText(size_t target_bytes_remaining, OutputStringInterface* output_string) { const size_t bytes_decoded_this_chunk = decoded_target_.size() - initial_decoded_target_size_; if (bytes_decoded_this_chunk > 0) { if (target_bytes_remaining > 0) { // The decoder is midway through decoding a target window. Resize // output_string to match the expected length. The interface guarantees // not to resize the output_string more than once per target window // decoded. output_string->ReserveAdditionalBytes(bytes_decoded_this_chunk + target_bytes_remaining); } output_string->append( decoded_target_.data() + initial_decoded_target_size_, bytes_decoded_this_chunk); } } private: const string& decoded_target_; size_t initial_decoded_target_size_; }; } // anonymous namespace bool VCDiffStreamingDecoderImpl::DecodeChunk( const char* data, size_t len, OutputStringInterface* output_string) { if (!start_decoding_was_called_) { LOG(DFATAL) << "DecodeChunk() called without StartDecoding()" << LOG_ENDL; Reset(); return false; } ParseableChunk parseable_chunk(data, len); if (!unparsed_bytes_.empty()) { unparsed_bytes_.append(data, len); parseable_chunk.SetDataBuffer(unparsed_bytes_.data(), unparsed_bytes_.size()); } TrackNewOutputText output_tracker(decoded_target_); VCDiffResult result = ReadDeltaFileHeader(&parseable_chunk); if (RESULT_SUCCESS == result) { result = ReadCustomCodeTable(&parseable_chunk); } if (RESULT_SUCCESS == result) { result = delta_window_.DecodeWindows(&parseable_chunk); } if (RESULT_ERROR == result) { Reset(); // Don't allow further DecodeChunk calls return false; } unparsed_bytes_.assign(parseable_chunk.UnparsedData(), parseable_chunk.UnparsedSize()); output_tracker.AppendNewOutputText(delta_window_.TargetBytesRemaining(), output_string); if (!allow_vcd_target()) { // VCD_TARGET will never be used to reference target data beyond the start // of the current window, so throw away any earlier target data. TruncateToBeginningOfWindow(); } return true; } // Finishes decoding after all data has been received. Returns true // if decoding of the entire stream was successful. bool VCDiffStreamingDecoderImpl::FinishDecoding() { bool success = true; if (!start_decoding_was_called_) { LOG(WARNING) << "FinishDecoding() called before StartDecoding()," " or called after DecodeChunk() returned false" << LOG_ENDL; success = false; } else if (!IsDecodingComplete()) { LOG(ERROR) << "FinishDecoding() called before parsing entire" " delta file window" << LOG_ENDL; success = false; } // Reset the object state for the next decode operation Reset(); return success; } bool VCDiffStreamingDecoderImpl::TargetWindowWouldExceedSizeLimits( size_t window_size) const { if (window_size > maximum_target_window_size_) { LOG(ERROR) << "Length of target window (" << window_size << ") exceeds limit of " << maximum_target_window_size_ << " bytes" << LOG_ENDL; return true; } if (HasPlannedTargetFileSize()) { // The logical expression to check would be: // // total_of_target_window_sizes_ + window_size > planned_target_file_size_ // // but the addition might cause an integer overflow if target_bytes_to_add // is very large. So it is better to check target_bytes_to_add against // the remaining planned target bytes. size_t remaining_planned_target_file_size = planned_target_file_size_ - total_of_target_window_sizes_; if (window_size > remaining_planned_target_file_size) { LOG(ERROR) << "Length of target window (" << window_size << " bytes) plus previous windows (" << total_of_target_window_sizes_ << " bytes) would exceed planned size of " << planned_target_file_size_ << " bytes" << LOG_ENDL; return true; } } size_t remaining_maximum_target_bytes = maximum_target_file_size_ - total_of_target_window_sizes_; if (window_size > remaining_maximum_target_bytes) { LOG(ERROR) << "Length of target window (" << window_size << " bytes) plus previous windows (" << total_of_target_window_sizes_ << " bytes) would exceed maximum target file size of " << maximum_target_file_size_ << " bytes" << LOG_ENDL; return true; } return false; } // *** Methods for VCDiffDeltaFileWindow void VCDiffDeltaFileWindow::Reset() { found_header_ = false; // Mark the start of the current target window. target_window_start_pos_ = parent_ ? parent_->decoded_target()->size() : 0U; target_window_length_ = 0; source_segment_ptr_ = NULL; source_segment_length_ = 0; instructions_and_sizes_.Invalidate(); data_for_add_and_run_.Invalidate(); addresses_for_copy_.Invalidate(); interleaved_bytes_expected_ = 0; has_checksum_ = false; expected_checksum_ = 0; } VCDiffResult VCDiffDeltaFileWindow::SetUpWindowSections( VCDiffHeaderParser* header_parser) { size_t add_and_run_data_length = 0; size_t instructions_and_sizes_length = 0; size_t addresses_length = 0; if (!header_parser->ParseSectionLengths(has_checksum_, &add_and_run_data_length, &instructions_and_sizes_length, &addresses_length, &expected_checksum_)) { return header_parser->GetResult(); } if (parent_->AllowInterleaved() && (add_and_run_data_length == 0) && (addresses_length == 0)) { // The interleaved format is being used. interleaved_bytes_expected_ = static_cast(instructions_and_sizes_length); UpdateInterleavedSectionPointers(header_parser->UnparsedData(), header_parser->End()); } else { // If interleaved format is not used, then the whole window contents // must be available before decoding can begin. If only part of // the current window is available, then report end of data // and re-parse the whole header when DecodeChunk() is called again. if (header_parser->UnparsedSize() < (add_and_run_data_length + instructions_and_sizes_length + addresses_length)) { return RESULT_END_OF_DATA; } data_for_add_and_run_.Init(header_parser->UnparsedData(), add_and_run_data_length); instructions_and_sizes_.Init(data_for_add_and_run_.End(), instructions_and_sizes_length); addresses_for_copy_.Init(instructions_and_sizes_.End(), addresses_length); if (addresses_for_copy_.End() != header_parser->EndOfDeltaWindow()) { LOG(ERROR) << "The end of the instructions section " "does not match the end of the delta window" << LOG_ENDL; return RESULT_ERROR; } } reader_.Init(instructions_and_sizes_.UnparsedDataAddr(), instructions_and_sizes_.End()); return RESULT_SUCCESS; } // Here are the elements of the delta window header to be parsed, // from section 4 of the RFC: // // Window1 // Win_Indicator - byte // [Source segment size] - integer // [Source segment position] - integer // The delta encoding of the target window // Length of the delta encoding - integer // The delta encoding // Size of the target window - integer // Delta_Indicator - byte // Length of data for ADDs and RUNs - integer // Length of instructions and sizes - integer // Length of addresses for COPYs - integer // Data section for ADDs and RUNs - array of bytes // Instructions and sizes section - array of bytes // Addresses section for COPYs - array of bytes // VCDiffResult VCDiffDeltaFileWindow::ReadHeader( ParseableChunk* parseable_chunk) { std::string* decoded_target = parent_->decoded_target(); VCDiffHeaderParser header_parser(parseable_chunk->UnparsedData(), parseable_chunk->End()); size_t source_segment_position = 0; unsigned char win_indicator = 0; if (!header_parser.ParseWinIndicatorAndSourceSegment( parent_->dictionary_size(), decoded_target->size(), parent_->allow_vcd_target(), &win_indicator, &source_segment_length_, &source_segment_position)) { return header_parser.GetResult(); } has_checksum_ = parent_->AllowChecksum() && (win_indicator & VCD_CHECKSUM); if (!header_parser.ParseWindowLengths(&target_window_length_)) { return header_parser.GetResult(); } if (parent_->TargetWindowWouldExceedSizeLimits(target_window_length_)) { // An error has been logged by TargetWindowWouldExceedSizeLimits(). return RESULT_ERROR; } header_parser.ParseDeltaIndicator(); VCDiffResult setup_return_code = SetUpWindowSections(&header_parser); if (RESULT_SUCCESS != setup_return_code) { return setup_return_code; } // Reserve enough space in the output string for the current target window. decoded_target->reserve(target_window_start_pos_ + target_window_length_); // Get a pointer to the start of the source segment. if (win_indicator & VCD_SOURCE) { source_segment_ptr_ = parent_->dictionary_ptr() + source_segment_position; } else if (win_indicator & VCD_TARGET) { // This assignment must happen after the reserve(). // decoded_target should not be resized again while processing this window, // so source_segment_ptr_ should remain valid. source_segment_ptr_ = decoded_target->data() + source_segment_position; } // The whole window header was found and parsed successfully. found_header_ = true; parseable_chunk->Advance(header_parser.ParsedSize()); parent_->AddToTotalTargetWindowSize(target_window_length_); return RESULT_SUCCESS; } void VCDiffDeltaFileWindow::UpdateInstructionPointer( ParseableChunk* parseable_chunk) { if (IsInterleaved()) { size_t bytes_parsed = instructions_and_sizes_.ParsedSize(); // Reduce expected instruction segment length by bytes parsed interleaved_bytes_expected_ -= static_cast(bytes_parsed); parseable_chunk->Advance(bytes_parsed); } } inline size_t VCDiffDeltaFileWindow::TargetBytesDecoded() { return parent_->decoded_target()->size() - target_window_start_pos_; } size_t VCDiffDeltaFileWindow::TargetBytesRemaining() { if (target_window_length_ == 0) { // There is no window being decoded at present return 0; } else { return target_window_length_ - TargetBytesDecoded(); } } inline void VCDiffDeltaFileWindow::CopyBytes(const char* data, size_t size) { parent_->decoded_target()->append(data, size); } inline void VCDiffDeltaFileWindow::RunByte(unsigned char byte, size_t size) { parent_->decoded_target()->append(size, byte); } VCDiffResult VCDiffDeltaFileWindow::DecodeAdd(size_t size) { if (size > data_for_add_and_run_.UnparsedSize()) { return RESULT_END_OF_DATA; } // Write the next "size" data bytes CopyBytes(data_for_add_and_run_.UnparsedData(), size); data_for_add_and_run_.Advance(size); return RESULT_SUCCESS; } VCDiffResult VCDiffDeltaFileWindow::DecodeRun(size_t size) { if (data_for_add_and_run_.Empty()) { return RESULT_END_OF_DATA; } // Write "size" copies of the next data byte RunByte(*data_for_add_and_run_.UnparsedData(), size); data_for_add_and_run_.Advance(1); return RESULT_SUCCESS; } VCDiffResult VCDiffDeltaFileWindow::DecodeCopy(size_t size, unsigned char mode) { // Keep track of the number of target bytes decoded as a local variable // to avoid recalculating it each time it is needed. size_t target_bytes_decoded = TargetBytesDecoded(); const VCDAddress here_address = static_cast(source_segment_length_ + target_bytes_decoded); const VCDAddress decoded_address = parent_->addr_cache()->DecodeAddress( here_address, mode, addresses_for_copy_.UnparsedDataAddr(), addresses_for_copy_.End()); switch (decoded_address) { case RESULT_ERROR: LOG(ERROR) << "Unable to decode address for COPY" << LOG_ENDL; return RESULT_ERROR; case RESULT_END_OF_DATA: return RESULT_END_OF_DATA; default: if ((decoded_address < 0) || (decoded_address > here_address)) { LOG(DFATAL) << "Internal error: unexpected address " << decoded_address << " returned from DecodeAddress, with here_address = " << here_address << LOG_ENDL; return RESULT_ERROR; } break; } size_t address = static_cast(decoded_address); if ((address + size) <= source_segment_length_) { // Copy all data from source segment CopyBytes(&source_segment_ptr_[address], size); return RESULT_SUCCESS; } // Copy some data from target window... if (address < source_segment_length_) { // ... plus some data from source segment const size_t partial_copy_size = source_segment_length_ - address; CopyBytes(&source_segment_ptr_[address], partial_copy_size); target_bytes_decoded += partial_copy_size; address += partial_copy_size; size -= partial_copy_size; } address -= source_segment_length_; // address is now based at start of target window const char* const target_segment_ptr = parent_->decoded_target()->data() + target_window_start_pos_; while (size > (target_bytes_decoded - address)) { // Recursive copy that extends into the yet-to-be-copied target data const size_t partial_copy_size = target_bytes_decoded - address; CopyBytes(&target_segment_ptr[address], partial_copy_size); target_bytes_decoded += partial_copy_size; address += partial_copy_size; size -= partial_copy_size; } CopyBytes(&target_segment_ptr[address], size); return RESULT_SUCCESS; } int VCDiffDeltaFileWindow::DecodeBody(ParseableChunk* parseable_chunk) { if (IsInterleaved() && (instructions_and_sizes_.UnparsedData() != parseable_chunk->UnparsedData())) { LOG(DFATAL) << "Internal error: interleaved format is used, but the" " input pointer does not point to the instructions section" << LOG_ENDL; return RESULT_ERROR; } while (TargetBytesDecoded() < target_window_length_) { int32_t decoded_size = VCD_INSTRUCTION_ERROR; unsigned char mode = 0; VCDiffInstructionType instruction = reader_.GetNextInstruction(&decoded_size, &mode); switch (instruction) { case VCD_INSTRUCTION_END_OF_DATA: UpdateInstructionPointer(parseable_chunk); return RESULT_END_OF_DATA; case VCD_INSTRUCTION_ERROR: return RESULT_ERROR; default: break; } const size_t size = static_cast(decoded_size); // The value of "size" itself could be enormous (say, INT32_MAX) // so check it individually against the limit to protect against // overflow when adding it to something else. if ((size > target_window_length_) || ((size + TargetBytesDecoded()) > target_window_length_)) { LOG(ERROR) << VCDiffInstructionName(instruction) << " with size " << size << " plus existing " << TargetBytesDecoded() << " bytes of target data exceeds length of target" " window (" << target_window_length_ << " bytes)" << LOG_ENDL; return RESULT_ERROR; } VCDiffResult result = RESULT_SUCCESS; switch (instruction) { case VCD_ADD: result = DecodeAdd(size); break; case VCD_RUN: result = DecodeRun(size); break; case VCD_COPY: result = DecodeCopy(size, mode); break; default: LOG(DFATAL) << "Unexpected instruction type " << instruction << "in opcode stream" << LOG_ENDL; return RESULT_ERROR; } switch (result) { case RESULT_END_OF_DATA: reader_.UnGetInstruction(); UpdateInstructionPointer(parseable_chunk); return RESULT_END_OF_DATA; case RESULT_ERROR: return RESULT_ERROR; case RESULT_SUCCESS: break; } } if (TargetBytesDecoded() != target_window_length_) { LOG(ERROR) << "Decoded target window size (" << TargetBytesDecoded() << " bytes) does not match expected size (" << target_window_length_ << " bytes)" << LOG_ENDL; return RESULT_ERROR; } const char* const target_window_start = parent_->decoded_target()->data() + target_window_start_pos_; if (has_checksum_ && (ComputeAdler32(target_window_start, target_window_length_) != expected_checksum_)) { LOG(ERROR) << "Target data does not match checksum; this could mean " "that the wrong dictionary was used" << LOG_ENDL; return RESULT_ERROR; } if (!instructions_and_sizes_.Empty()) { LOG(ERROR) << "Excess instructions and sizes left over " "after decoding target window" << LOG_ENDL; return RESULT_ERROR; } if (!IsInterleaved()) { // Standard format is being used, with three separate sections for the // instructions, data, and addresses. if (!data_for_add_and_run_.Empty()) { LOG(ERROR) << "Excess ADD/RUN data left over " "after decoding target window" << LOG_ENDL; return RESULT_ERROR; } if (!addresses_for_copy_.Empty()) { LOG(ERROR) << "Excess COPY addresses left over " "after decoding target window" << LOG_ENDL; return RESULT_ERROR; } // Reached the end of the window. Update the ParseableChunk to point to the // end of the addresses section, which is the last section in the window. parseable_chunk->SetPosition(addresses_for_copy_.End()); } else { // Interleaved format is being used. UpdateInstructionPointer(parseable_chunk); } return RESULT_SUCCESS; } VCDiffResult VCDiffDeltaFileWindow::DecodeWindows( ParseableChunk* parseable_chunk) { if (!parent_) { LOG(DFATAL) << "Internal error: VCDiffDeltaFileWindow::DecodeWindows() " "called before VCDiffDeltaFileWindow::Init()" << LOG_ENDL; return RESULT_ERROR; } while (!parseable_chunk->Empty()) { if (!found_header_) { switch (ReadHeader(parseable_chunk)) { case RESULT_END_OF_DATA: return RESULT_END_OF_DATA; case RESULT_ERROR: return RESULT_ERROR; default: // Reset address cache between windows (RFC section 5.1) if (!parent_->addr_cache()->Init()) { LOG(DFATAL) << "Error initializing address cache" << LOG_ENDL; return RESULT_ERROR; } } } else { // We are resuming a window that was partially decoded before a // RESULT_END_OF_DATA was returned. This can only happen on the first // loop iteration, and only if the interleaved format is enabled and used. if (!IsInterleaved()) { LOG(DFATAL) << "Internal error: Resumed decoding of a delta file window" " when interleaved format is not being used" << LOG_ENDL; return RESULT_ERROR; } UpdateInterleavedSectionPointers(parseable_chunk->UnparsedData(), parseable_chunk->End()); reader_.UpdatePointers(instructions_and_sizes_.UnparsedDataAddr(), instructions_and_sizes_.End()); } switch (DecodeBody(parseable_chunk)) { case RESULT_END_OF_DATA: if (MoreDataExpected()) { return RESULT_END_OF_DATA; } else { LOG(ERROR) << "End of data reached while decoding VCDIFF delta file" << LOG_ENDL; // fall through to RESULT_ERROR case } case RESULT_ERROR: return RESULT_ERROR; default: break; // DecodeBody succeeded } // Get ready to read a new delta window Reset(); if (parent_->ReachedPlannedTargetFileSize()) { // Found exactly the length we expected. Stop decoding. return RESULT_SUCCESS; } } return RESULT_SUCCESS; } // *** Methods for VCDiffStreamingDecoder VCDiffStreamingDecoder::VCDiffStreamingDecoder() : impl_(new VCDiffStreamingDecoderImpl) { } VCDiffStreamingDecoder::~VCDiffStreamingDecoder() { delete impl_; } void VCDiffStreamingDecoder::StartDecoding(const char* source, size_t len) { impl_->StartDecoding(source, len); } bool VCDiffStreamingDecoder::DecodeChunkToInterface( const char* data, size_t len, OutputStringInterface* output_string) { return impl_->DecodeChunk(data, len, output_string); } bool VCDiffStreamingDecoder::FinishDecoding() { return impl_->FinishDecoding(); } bool VCDiffStreamingDecoder::SetMaximumTargetFileSize( size_t new_maximum_target_file_size) { return impl_->SetMaximumTargetFileSize(new_maximum_target_file_size); } bool VCDiffStreamingDecoder::SetMaximumTargetWindowSize( size_t new_maximum_target_window_size) { return impl_->SetMaximumTargetWindowSize(new_maximum_target_window_size); } void VCDiffStreamingDecoder::SetAllowVcdTarget(bool allow_vcd_target) { impl_->SetAllowVcdTarget(allow_vcd_target); } bool VCDiffDecoder::DecodeToInterface(const char* dictionary_ptr, size_t dictionary_size, const string& encoding, OutputStringInterface* target) { target->clear(); decoder_.StartDecoding(dictionary_ptr, dictionary_size); if (!decoder_.DecodeChunkToInterface(encoding.data(), encoding.size(), target)) { return false; } return decoder_.FinishDecoding(); } } // namespace open_vcdiff