// Copyright 2007 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
15  
16  #ifndef OPEN_VCDIFF_VCENCODER_H_
17  #define OPEN_VCDIFF_VCENCODER_H_
18  
19  #include <stddef.h>  // size_t
20  #include <vector>
21  #include "google/output_string.h"
22  
23  namespace open_vcdiff {
24  
25  class VCDiffEngine;
26  class VCDiffStreamingEncoderImpl;
27  
// Format-extension flags.  They are passed to the constructor of
// VCDiffStreamingEncoder to select open-vcdiff format extensions that are
// not part of the RFC 3284 draft standard for VCDIFF.
//
// Because these extensions are non-standard, a caller that specifies any
// flag other than VCD_STANDARD_FORMAT must be certain that the receiver of
// the data will decode the delta file with open-vcdiff, or at least that the
// receiver can interpret these extensions.  The encoder will use an 'S' as
// the fourth character in the delta file to indicate that non-standard
// extensions are being used.
//
enum VCDiffFormatExtensionFlagValues {
  // No extensions: the encoded format will conform to the RFC
  // draft standard for VCDIFF.
  VCD_STANDARD_FORMAT = 0x00,

  // Interleaved format: the encoder writes each delta file window by
  // interleaving instructions and sizes with their corresponding addresses
  // and data, rather than placing these elements into three separate
  // sections.  This facilitates providing partially decoded results when
  // only a portion of a delta file window is received (e.g. when HTTP over
  // TCP is used as the transmission protocol.)
  VCD_FORMAT_INTERLEAVED = 0x01,

  // Checksum format: an Adler32 checksum of the target window data is
  // included in the delta window.
  VCD_FORMAT_CHECKSUM = 0x02
};

// Holds zero or more VCDiffFormatExtensionFlagValues.  The non-zero values
// above occupy distinct bits, so they can be combined with bitwise OR.
typedef int VCDiffFormatExtensionFlags;
57  
58  // A HashedDictionary must be constructed from the dictionary data
59  // in order to use VCDiffStreamingEncoder.  If the same dictionary will
60  // be used to perform several encoding operations, then the caller should
61  // create the HashedDictionary once and cache it for reuse.  This object
62  // is thread-safe: the same const HashedDictionary can be used
63  // by several threads simultaneously, each with its own VCDiffStreamingEncoder.
64  //
65  // dictionary_contents is copied into the HashedDictionary, so the
66  // caller may free that string, if desired, after the constructor returns.
67  //
68  class HashedDictionary {
69   public:
70    HashedDictionary(const char* dictionary_contents,
71                     size_t dictionary_size);
72    ~HashedDictionary();
73  
74    // Init() must be called before using the HashedDictionary as an argument
75    // to the VCDiffStreamingEncoder, or for any other purpose except
76    // destruction.  It returns true if initialization succeeded, or false
77    // if an error occurred, in which case the caller should destroy the object
78    // without using it.
79    bool Init();
80  
engine()81    const VCDiffEngine* engine() const { return engine_; }
82  
83   private:
84    const VCDiffEngine* engine_;
85  
86    // Make the copy constructor and assignment operator private
87    // so that they don't inadvertently get used.
88    HashedDictionary(const HashedDictionary&);  // NOLINT
89    void operator=(const HashedDictionary&);
90  };
91  
92  // The standard streaming interface to the VCDIFF (RFC 3284) encoder.
93  // "Streaming" in this context means that, even though the entire set of
94  // input data to be encoded may not be available at once, the encoder
95  // can produce partial output based on what is available.  Of course,
96  // the caller should try to maximize the sizes of the data chunks passed
97  // to the encoder.
98  class VCDiffStreamingEncoder {
99   public:
100    // The HashedDictionary object passed to the constructor must remain valid,
101    // without being deleted, for the lifetime of the VCDiffStreamingEncoder
102    // object.
103    //
104    // format_extensions allows certain open-vcdiff extensions to the VCDIFF
105    // format to be included in the encoded output.  These extensions are not
106    // part of the RFC 3284 draft standard, so specifying any extension flags
107    // will make the output compatible only with open-vcdiff, or with other
108    // VCDIFF implementations that accept these extensions.  See above for an
109    // explanation of each possible flag value.
110    //
111    // *** look_for_target_matches:
112    // The VCDIFF format allows COPY instruction addresses to reference data from
113    // the source (dictionary), or from previously encoded target data.
114    //
115    // If look_for_target_matches is false, then the encoder will only
116    // produce COPY instructions that reference source data from the dictionary,
117    // never from previously encoded target data.  This will speed up the encoding
118    // process, but the encoded data will not be as compact.
119    //
120    // If this value is true, then the encoder will produce COPY instructions
121    // that reference either source data or target data.  A COPY instruction from
122    // the previously encoded target data may even extend into the range of the
123    // data being produced by that same COPY instruction; for example, if the
124    // previously encoded target data is "LA", then a single COPY instruction of
125    // length 10 can produce the additional target data "LALALALALA".
126    //
127    // There is a third type of COPY instruction that starts within
128    // the source data and extends from the end of the source data
129    // into the beginning of the target data.  This VCDIFF encoder will never
130    // produce a COPY instruction of this third type (regardless of the value of
131    // look_for_target_matches) because the cost of checking for matches
132    // across the source-target boundary would not justify its benefits.
133    //
134    VCDiffStreamingEncoder(const HashedDictionary* dictionary,
135                           VCDiffFormatExtensionFlags format_extensions,
136                           bool look_for_target_matches);
137    ~VCDiffStreamingEncoder();
138  
139    // The client should use these routines as follows:
140    //    HashedDictionary hd(dictionary, dictionary_size);
141    //    if (!hd.Init()) {
142    //      HandleError();
143    //      return;
144    //    }
145    //    string output_string;
146    //    VCDiffStreamingEncoder v(hd, false, false);
147    //    if (!v.StartEncoding(&output_string)) {
148    //      HandleError();
149    //      return;  // No need to call FinishEncoding()
150    //    }
151    //    Process(output_string.data(), output_string.size());
152    //    output_string.clear();
153    //    while (get data_buf) {
154    //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
155    //        HandleError();
156    //        return;  // No need to call FinishEncoding()
157    //      }
158    //      // The encoding is appended to output_string at each call,
159    //      // so clear output_string once its contents have been processed.
160    //      Process(output_string.data(), output_string.size());
161    //      output_string.clear();
162    //    }
163    //    if (!v.FinishEncoding(&output_string)) {
164    //      HandleError();
165    //      return;
166    //    }
167    //    Process(output_string.data(), output_string.size());
168    //    output_string.clear();
169    //
170    // I.e., the allowed pattern of calls is
171    //    StartEncoding EncodeChunk* FinishEncoding
172    //
173    // The size of the encoded output depends on the sizes of the chunks
174    // passed in (i.e. the chunking boundary affects compression).
175    // However the decoded output is independent of chunk boundaries.
176  
177    // Sets up the data structures for encoding.
178    // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
179    // to *output_string.
180    //
181    // Note: we *append*, so the old contents of *output_string stick around.
182    // This convention differs from the non-streaming Encode/Decode
183    // interfaces in VCDiffEncoder.
184    //
185    // If an error occurs, this function returns false; otherwise it returns true.
186    // If this function returns false, the caller does not need to call
187    // FinishEncoding or to do any cleanup except destroying the
188    // VCDiffStreamingEncoder object.
189    template<class OutputType>
StartEncoding(OutputType * output)190    bool StartEncoding(OutputType* output) {
191      OutputString<OutputType> output_string(output);
192      return StartEncodingToInterface(&output_string);
193    }
194  
195    bool StartEncodingToInterface(OutputStringInterface* output_string);
196  
197    // Appends compressed encoding for "data" (one complete VCDIFF delta window)
198    // to *output_string.
199    // If an error occurs (for example, if StartEncoding was not called
200    // earlier or StartEncoding returned false), this function returns false;
201    // otherwise it returns true.  The caller does not need to call FinishEncoding
202    // or do any cleanup except destroying the VCDiffStreamingEncoder
203    // if this function returns false.
204    template<class OutputType>
EncodeChunk(const char * data,size_t len,OutputType * output)205    bool EncodeChunk(const char* data, size_t len, OutputType* output) {
206      OutputString<OutputType> output_string(output);
207      return EncodeChunkToInterface(data, len, &output_string);
208    }
209  
210    bool EncodeChunkToInterface(const char* data, size_t len,
211                                OutputStringInterface* output_string);
212  
213    // Finishes encoding and appends any leftover encoded data to *output_string.
214    // If an error occurs (for example, if StartEncoding was not called
215    // earlier or StartEncoding returned false), this function returns false;
216    // otherwise it returns true.  The caller does not need to
217    // do any cleanup except destroying the VCDiffStreamingEncoder
218    // if this function returns false.
219    template<class OutputType>
FinishEncoding(OutputType * output)220    bool FinishEncoding(OutputType* output) {
221      OutputString<OutputType> output_string(output);
222      return FinishEncodingToInterface(&output_string);
223    }
224  
225    bool FinishEncodingToInterface(OutputStringInterface* output_string);
226  
227    // Replaces the contents of match_counts with a vector of integers,
228    // one for each possible match length.  The value of match_counts[n]
229    // is equal to the number of matches of length n found so far
230    // for this VCDiffStreamingEncoder object.
231    void GetMatchCounts(std::vector<int>* match_counts) const;
232  
233   private:
234    VCDiffStreamingEncoderImpl* const impl_;
235  
236    // Make the copy constructor and assignment operator private
237    // so that they don't inadvertently get used.
238    VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
239    void operator=(const VCDiffStreamingEncoder&);
240  };
241  
242  // A simpler (non-streaming) interface to the VCDIFF encoder that can be used
243  // if the entire target data string is available.
244  //
245  class VCDiffEncoder {
246   public:
VCDiffEncoder(const char * dictionary_contents,size_t dictionary_size)247    VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
248        : dictionary_(dictionary_contents, dictionary_size),
249          encoder_(NULL),
250          flags_(VCD_STANDARD_FORMAT),
251          look_for_target_matches_(true) { }
252  
~VCDiffEncoder()253    ~VCDiffEncoder() {
254      delete encoder_;
255    }
256  
257    // By default, VCDiffEncoder uses standard VCDIFF format.  This function
258    // can be used before calling Encode(), to specify that interleaved format
259    // and/or checksum format should be used.
SetFormatFlags(VCDiffFormatExtensionFlags flags)260    void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
261  
262    // By default, VCDiffEncoder looks for matches in the dictionary and also in
263    // the previously encoded target data.  This function can be used before
264    // calling Encode(), to specify whether or not target matching should be
265    // enabled.
SetTargetMatching(bool look_for_target_matches)266    void SetTargetMatching(bool look_for_target_matches) {
267      look_for_target_matches_ = look_for_target_matches;
268    }
269  
270    // Replaces old contents of output_string with the encoded form of
271    // target_data.
272    template<class OutputType>
Encode(const char * target_data,size_t target_len,OutputType * output)273    bool Encode(const char* target_data,
274                size_t target_len,
275                OutputType* output) {
276      OutputString<OutputType> output_string(output);
277      return EncodeToInterface(target_data, target_len, &output_string);
278    }
279  
280   private:
281    bool EncodeToInterface(const char* target_data,
282                           size_t target_len,
283                           OutputStringInterface* output_string);
284  
285    HashedDictionary dictionary_;
286    VCDiffStreamingEncoder* encoder_;
287    VCDiffFormatExtensionFlags flags_;
288    bool look_for_target_matches_;
289  
290    // Make the copy constructor and assignment operator private
291    // so that they don't inadvertently get used.
292    VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
293    void operator=(const VCDiffEncoder&);
294  };
295  
296  }  // namespace open_vcdiff
297  
298  #endif  // OPEN_VCDIFF_VCENCODER_H_
299