1 // Copyright 2007 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #ifndef OPEN_VCDIFF_VCENCODER_H_ 17 #define OPEN_VCDIFF_VCENCODER_H_ 18 19 #include <stddef.h> // size_t 20 #include <vector> 21 #include "google/output_string.h" 22 23 namespace open_vcdiff { 24 25 class VCDiffEngine; 26 class VCDiffStreamingEncoderImpl; 27 28 // These flags are passed to the constructor of VCDiffStreamingEncoder 29 // to determine whether certain open-vcdiff format extensions 30 // (which are not part of the RFC 3284 draft standard for VCDIFF) 31 // are employed. 32 // 33 // Because these extensions are not part of the VCDIFF standard, if 34 // any of these flags except VCD_STANDARD_FORMAT is specified, then the caller 35 // must be certain that the receiver of the data will be using open-vcdiff 36 // to decode the delta file, or at least that the receiver can interpret 37 // these extensions. The encoder will use an 'S' as the fourth character 38 // in the delta file to indicate that non-standard extensions are being used. 39 // 40 enum VCDiffFormatExtensionFlagValues { 41 // No extensions: the encoded format will conform to the RFC 42 // draft standard for VCDIFF. 43 VCD_STANDARD_FORMAT = 0x00, 44 // If this flag is specified, then the encoder writes each delta file 45 // window by interleaving instructions and sizes with their corresponding 46 // addresses and data, rather than placing these elements 47 // into three separate sections. This facilitates providing partially 48 // decoded results when only a portion of a delta file window is received 49 // (e.g. when HTTP over TCP is used as the transmission protocol.) 50 VCD_FORMAT_INTERLEAVED = 0x01, 51 // If this flag is specified, then an Adler32 checksum 52 // of the target window data is included in the delta window. 53 VCD_FORMAT_CHECKSUM = 0x02 54 }; 55 56 typedef int VCDiffFormatExtensionFlags; 57 58 // A HashedDictionary must be constructed from the dictionary data 59 // in order to use VCDiffStreamingEncoder. If the same dictionary will 60 // be used to perform several encoding operations, then the caller should 61 // create the HashedDictionary once and cache it for reuse. This object 62 // is thread-safe: the same const HashedDictionary can be used 63 // by several threads simultaneously, each with its own VCDiffStreamingEncoder. 64 // 65 // dictionary_contents is copied into the HashedDictionary, so the 66 // caller may free that string, if desired, after the constructor returns. 67 // 68 class HashedDictionary { 69 public: 70 HashedDictionary(const char* dictionary_contents, 71 size_t dictionary_size); 72 ~HashedDictionary(); 73 74 // Init() must be called before using the HashedDictionary as an argument 75 // to the VCDiffStreamingEncoder, or for any other purpose except 76 // destruction. It returns true if initialization succeeded, or false 77 // if an error occurred, in which case the caller should destroy the object 78 // without using it. 79 bool Init(); 80 engine()81 const VCDiffEngine* engine() const { return engine_; } 82 83 private: 84 const VCDiffEngine* engine_; 85 86 // Make the copy constructor and assignment operator private 87 // so that they don't inadvertently get used. 88 HashedDictionary(const HashedDictionary&); // NOLINT 89 void operator=(const HashedDictionary&); 90 }; 91 92 // The standard streaming interface to the VCDIFF (RFC 3284) encoder. 93 // "Streaming" in this context means that, even though the entire set of 94 // input data to be encoded may not be available at once, the encoder 95 // can produce partial output based on what is available. Of course, 96 // the caller should try to maximize the sizes of the data chunks passed 97 // to the encoder. 98 class VCDiffStreamingEncoder { 99 public: 100 // The HashedDictionary object passed to the constructor must remain valid, 101 // without being deleted, for the lifetime of the VCDiffStreamingEncoder 102 // object. 103 // 104 // format_extensions allows certain open-vcdiff extensions to the VCDIFF 105 // format to be included in the encoded output. These extensions are not 106 // part of the RFC 3284 draft standard, so specifying any extension flags 107 // will make the output compatible only with open-vcdiff, or with other 108 // VCDIFF implementations that accept these extensions. See above for an 109 // explanation of each possible flag value. 110 // 111 // *** look_for_target_matches: 112 // The VCDIFF format allows COPY instruction addresses to reference data from 113 // the source (dictionary), or from previously encoded target data. 114 // 115 // If look_for_target_matches is false, then the encoder will only 116 // produce COPY instructions that reference source data from the dictionary, 117 // never from previously encoded target data. This will speed up the encoding 118 // process, but the encoded data will not be as compact. 119 // 120 // If this value is true, then the encoder will produce COPY instructions 121 // that reference either source data or target data. A COPY instruction from 122 // the previously encoded target data may even extend into the range of the 123 // data being produced by that same COPY instruction; for example, if the 124 // previously encoded target data is "LA", then a single COPY instruction of 125 // length 10 can produce the additional target data "LALALALALA". 126 // 127 // There is a third type of COPY instruction that starts within 128 // the source data and extends from the end of the source data 129 // into the beginning of the target data. This VCDIFF encoder will never 130 // produce a COPY instruction of this third type (regardless of the value of 131 // look_for_target_matches) because the cost of checking for matches 132 // across the source-target boundary would not justify its benefits. 133 // 134 VCDiffStreamingEncoder(const HashedDictionary* dictionary, 135 VCDiffFormatExtensionFlags format_extensions, 136 bool look_for_target_matches); 137 ~VCDiffStreamingEncoder(); 138 139 // The client should use these routines as follows: 140 // HashedDictionary hd(dictionary, dictionary_size); 141 // if (!hd.Init()) { 142 // HandleError(); 143 // return; 144 // } 145 // string output_string; 146 // VCDiffStreamingEncoder v(hd, false, false); 147 // if (!v.StartEncoding(&output_string)) { 148 // HandleError(); 149 // return; // No need to call FinishEncoding() 150 // } 151 // Process(output_string.data(), output_string.size()); 152 // output_string.clear(); 153 // while (get data_buf) { 154 // if (!v.EncodeChunk(data_buf, data_len, &output_string)) { 155 // HandleError(); 156 // return; // No need to call FinishEncoding() 157 // } 158 // // The encoding is appended to output_string at each call, 159 // // so clear output_string once its contents have been processed. 160 // Process(output_string.data(), output_string.size()); 161 // output_string.clear(); 162 // } 163 // if (!v.FinishEncoding(&output_string)) { 164 // HandleError(); 165 // return; 166 // } 167 // Process(output_string.data(), output_string.size()); 168 // output_string.clear(); 169 // 170 // I.e., the allowed pattern of calls is 171 // StartEncoding EncodeChunk* FinishEncoding 172 // 173 // The size of the encoded output depends on the sizes of the chunks 174 // passed in (i.e. the chunking boundary affects compression). 175 // However the decoded output is independent of chunk boundaries. 176 177 // Sets up the data structures for encoding. 178 // Writes a VCDIFF delta file header (as defined in RFC section 4.1) 179 // to *output_string. 180 // 181 // Note: we *append*, so the old contents of *output_string stick around. 182 // This convention differs from the non-streaming Encode/Decode 183 // interfaces in VCDiffEncoder. 184 // 185 // If an error occurs, this function returns false; otherwise it returns true. 186 // If this function returns false, the caller does not need to call 187 // FinishEncoding or to do any cleanup except destroying the 188 // VCDiffStreamingEncoder object. 189 template<class OutputType> StartEncoding(OutputType * output)190 bool StartEncoding(OutputType* output) { 191 OutputString<OutputType> output_string(output); 192 return StartEncodingToInterface(&output_string); 193 } 194 195 bool StartEncodingToInterface(OutputStringInterface* output_string); 196 197 // Appends compressed encoding for "data" (one complete VCDIFF delta window) 198 // to *output_string. 199 // If an error occurs (for example, if StartEncoding was not called 200 // earlier or StartEncoding returned false), this function returns false; 201 // otherwise it returns true. The caller does not need to call FinishEncoding 202 // or do any cleanup except destroying the VCDiffStreamingEncoder 203 // if this function returns false. 204 template<class OutputType> EncodeChunk(const char * data,size_t len,OutputType * output)205 bool EncodeChunk(const char* data, size_t len, OutputType* output) { 206 OutputString<OutputType> output_string(output); 207 return EncodeChunkToInterface(data, len, &output_string); 208 } 209 210 bool EncodeChunkToInterface(const char* data, size_t len, 211 OutputStringInterface* output_string); 212 213 // Finishes encoding and appends any leftover encoded data to *output_string. 214 // If an error occurs (for example, if StartEncoding was not called 215 // earlier or StartEncoding returned false), this function returns false; 216 // otherwise it returns true. The caller does not need to 217 // do any cleanup except destroying the VCDiffStreamingEncoder 218 // if this function returns false. 219 template<class OutputType> FinishEncoding(OutputType * output)220 bool FinishEncoding(OutputType* output) { 221 OutputString<OutputType> output_string(output); 222 return FinishEncodingToInterface(&output_string); 223 } 224 225 bool FinishEncodingToInterface(OutputStringInterface* output_string); 226 227 // Replaces the contents of match_counts with a vector of integers, 228 // one for each possible match length. The value of match_counts[n] 229 // is equal to the number of matches of length n found so far 230 // for this VCDiffStreamingEncoder object. 231 void GetMatchCounts(std::vector<int>* match_counts) const; 232 233 private: 234 VCDiffStreamingEncoderImpl* const impl_; 235 236 // Make the copy constructor and assignment operator private 237 // so that they don't inadvertently get used. 238 VCDiffStreamingEncoder(const VCDiffStreamingEncoder&); // NOLINT 239 void operator=(const VCDiffStreamingEncoder&); 240 }; 241 242 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used 243 // if the entire target data string is available. 244 // 245 class VCDiffEncoder { 246 public: VCDiffEncoder(const char * dictionary_contents,size_t dictionary_size)247 VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size) 248 : dictionary_(dictionary_contents, dictionary_size), 249 encoder_(NULL), 250 flags_(VCD_STANDARD_FORMAT), 251 look_for_target_matches_(true) { } 252 ~VCDiffEncoder()253 ~VCDiffEncoder() { 254 delete encoder_; 255 } 256 257 // By default, VCDiffEncoder uses standard VCDIFF format. This function 258 // can be used before calling Encode(), to specify that interleaved format 259 // and/or checksum format should be used. SetFormatFlags(VCDiffFormatExtensionFlags flags)260 void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; } 261 262 // By default, VCDiffEncoder looks for matches in the dictionary and also in 263 // the previously encoded target data. This function can be used before 264 // calling Encode(), to specify whether or not target matching should be 265 // enabled. SetTargetMatching(bool look_for_target_matches)266 void SetTargetMatching(bool look_for_target_matches) { 267 look_for_target_matches_ = look_for_target_matches; 268 } 269 270 // Replaces old contents of output_string with the encoded form of 271 // target_data. 272 template<class OutputType> Encode(const char * target_data,size_t target_len,OutputType * output)273 bool Encode(const char* target_data, 274 size_t target_len, 275 OutputType* output) { 276 OutputString<OutputType> output_string(output); 277 return EncodeToInterface(target_data, target_len, &output_string); 278 } 279 280 private: 281 bool EncodeToInterface(const char* target_data, 282 size_t target_len, 283 OutputStringInterface* output_string); 284 285 HashedDictionary dictionary_; 286 VCDiffStreamingEncoder* encoder_; 287 VCDiffFormatExtensionFlags flags_; 288 bool look_for_target_matches_; 289 290 // Make the copy constructor and assignment operator private 291 // so that they don't inadvertently get used. 292 VCDiffEncoder(const VCDiffEncoder&); // NOLINT 293 void operator=(const VCDiffEncoder&); 294 }; 295 296 } // namespace open_vcdiff 297 298 #endif // OPEN_VCDIFF_VCENCODER_H_ 299