// Copyright 2007 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
15 
16 #ifndef OPEN_VCDIFF_VCENCODER_H_
17 #define OPEN_VCDIFF_VCENCODER_H_
18 
19 #include <stddef.h>  // size_t
20 #include <vector>
21 #include "google/output_string.h"
22 
23 namespace open_vcdiff {
24 
// Forward declarations: this header only stores pointers to these types,
// so their full definitions are not needed here.
class VCDiffEngine;
class VCDiffStreamingEncoderImpl;
27 
// These flags are passed to the constructor of VCDiffStreamingEncoder
// to select which open-vcdiff format extensions (features that are not part
// of the RFC 3284 draft standard for VCDIFF) are used in the encoded output.
//
// Because these extensions are not part of the VCDIFF standard, if any flag
// other than VCD_STANDARD_FORMAT is specified, the caller must be certain
// that the receiver of the data will be using open-vcdiff to decode the
// delta file, or at least that the receiver can interpret these extensions.
// The encoder will use an 'S' as the fourth character in the delta file
// to indicate that non-standard extensions are being used.
//
enum VCDiffFormatExtensionFlagValues {
  // No extensions: the encoded format will conform to the RFC
  // draft standard for VCDIFF.
  VCD_STANDARD_FORMAT = 0x00,
  // If this flag is specified, then the encoder writes each delta file
  // window by interleaving instructions and sizes with their corresponding
  // addresses and data, rather than placing these elements
  // into three separate sections.  This facilitates providing partially
  // decoded results when only a portion of a delta file window is received
  // (e.g. when HTTP over TCP is used as the transmission protocol.)
  VCD_FORMAT_INTERLEAVED = 0x01,
  // If this flag is specified, then an Adler32 checksum
  // of the target window data is included in the delta window.
  VCD_FORMAT_CHECKSUM = 0x02
};
55 
// Type used to pass VCDiffFormatExtensionFlagValues to the encoder classes.
// It is an int rather than the enum type, presumably so that OR-ed
// combinations of flag values can be passed without a cast.
typedef int VCDiffFormatExtensionFlags;
57 
58 // A HashedDictionary must be constructed from the dictionary data
59 // in order to use VCDiffStreamingEncoder.  If the same dictionary will
60 // be used to perform several encoding operations, then the caller should
61 // create the HashedDictionary once and cache it for reuse.  This object
62 // is thread-safe: the same const HashedDictionary can be used
63 // by several threads simultaneously, each with its own VCDiffStreamingEncoder.
64 //
65 // dictionary_contents is copied into the HashedDictionary, so the
66 // caller may free that string, if desired, after the constructor returns.
67 //
68 class HashedDictionary {
69  public:
70   HashedDictionary(const char* dictionary_contents,
71                    size_t dictionary_size);
72   ~HashedDictionary();
73 
74   // Init() must be called before using the HashedDictionary as an argument
75   // to the VCDiffStreamingEncoder, or for any other purpose except
76   // destruction.  It returns true if initialization succeeded, or false
77   // if an error occurred, in which case the caller should destroy the object
78   // without using it.
79   bool Init();
80 
engine()81   const VCDiffEngine* engine() const { return engine_; }
82 
83  private:
84   const VCDiffEngine* engine_;
85 
86   // Make the copy constructor and assignment operator private
87   // so that they don't inadvertently get used.
88   HashedDictionary(const HashedDictionary&);  // NOLINT
89   void operator=(const HashedDictionary&);
90 };
91 
92 // The standard streaming interface to the VCDIFF (RFC 3284) encoder.
93 // "Streaming" in this context means that, even though the entire set of
94 // input data to be encoded may not be available at once, the encoder
95 // can produce partial output based on what is available.  Of course,
96 // the caller should try to maximize the sizes of the data chunks passed
97 // to the encoder.
98 class VCDiffStreamingEncoder {
99  public:
100   // The HashedDictionary object passed to the constructor must remain valid,
101   // without being deleted, for the lifetime of the VCDiffStreamingEncoder
102   // object.
103   //
104   // format_extensions allows certain open-vcdiff extensions to the VCDIFF
105   // format to be included in the encoded output.  These extensions are not
106   // part of the RFC 3284 draft standard, so specifying any extension flags
107   // will make the output compatible only with open-vcdiff, or with other
108   // VCDIFF implementations that accept these extensions.  See above for an
109   // explanation of each possible flag value.
110   //
111   // *** look_for_target_matches:
112   // The VCDIFF format allows COPY instruction addresses to reference data from
113   // the source (dictionary), or from previously encoded target data.
114   //
115   // If look_for_target_matches is false, then the encoder will only
116   // produce COPY instructions that reference source data from the dictionary,
117   // never from previously encoded target data.  This will speed up the encoding
118   // process, but the encoded data will not be as compact.
119   //
120   // If this value is true, then the encoder will produce COPY instructions
121   // that reference either source data or target data.  A COPY instruction from
122   // the previously encoded target data may even extend into the range of the
123   // data being produced by that same COPY instruction; for example, if the
124   // previously encoded target data is "LA", then a single COPY instruction of
125   // length 10 can produce the additional target data "LALALALALA".
126   //
127   // There is a third type of COPY instruction that starts within
128   // the source data and extends from the end of the source data
129   // into the beginning of the target data.  This VCDIFF encoder will never
130   // produce a COPY instruction of this third type (regardless of the value of
131   // look_for_target_matches) because the cost of checking for matches
132   // across the source-target boundary would not justify its benefits.
133   //
134   VCDiffStreamingEncoder(const HashedDictionary* dictionary,
135                          VCDiffFormatExtensionFlags format_extensions,
136                          bool look_for_target_matches);
137   ~VCDiffStreamingEncoder();
138 
139   // The client should use these routines as follows:
140   //    HashedDictionary hd(dictionary, dictionary_size);
141   //    if (!hd.Init()) {
142   //      HandleError();
143   //      return;
144   //    }
145   //    string output_string;
146   //    VCDiffStreamingEncoder v(hd, false, false);
147   //    if (!v.StartEncoding(&output_string)) {
148   //      HandleError();
149   //      return;  // No need to call FinishEncoding()
150   //    }
151   //    Process(output_string.data(), output_string.size());
152   //    output_string.clear();
153   //    while (get data_buf) {
154   //      if (!v.EncodeChunk(data_buf, data_len, &output_string)) {
155   //        HandleError();
156   //        return;  // No need to call FinishEncoding()
157   //      }
158   //      // The encoding is appended to output_string at each call,
159   //      // so clear output_string once its contents have been processed.
160   //      Process(output_string.data(), output_string.size());
161   //      output_string.clear();
162   //    }
163   //    if (!v.FinishEncoding(&output_string)) {
164   //      HandleError();
165   //      return;
166   //    }
167   //    Process(output_string.data(), output_string.size());
168   //    output_string.clear();
169   //
170   // I.e., the allowed pattern of calls is
171   //    StartEncoding EncodeChunk* FinishEncoding
172   //
173   // The size of the encoded output depends on the sizes of the chunks
174   // passed in (i.e. the chunking boundary affects compression).
175   // However the decoded output is independent of chunk boundaries.
176 
177   // Sets up the data structures for encoding.
178   // Writes a VCDIFF delta file header (as defined in RFC section 4.1)
179   // to *output_string.
180   //
181   // Note: we *append*, so the old contents of *output_string stick around.
182   // This convention differs from the non-streaming Encode/Decode
183   // interfaces in VCDiffEncoder.
184   //
185   // If an error occurs, this function returns false; otherwise it returns true.
186   // If this function returns false, the caller does not need to call
187   // FinishEncoding or to do any cleanup except destroying the
188   // VCDiffStreamingEncoder object.
189   template<class OutputType>
StartEncoding(OutputType * output)190   bool StartEncoding(OutputType* output) {
191     OutputString<OutputType> output_string(output);
192     return StartEncodingToInterface(&output_string);
193   }
194 
195   bool StartEncodingToInterface(OutputStringInterface* output_string);
196 
197   // Appends compressed encoding for "data" (one complete VCDIFF delta window)
198   // to *output_string.
199   // If an error occurs (for example, if StartEncoding was not called
200   // earlier or StartEncoding returned false), this function returns false;
201   // otherwise it returns true.  The caller does not need to call FinishEncoding
202   // or do any cleanup except destroying the VCDiffStreamingEncoder
203   // if this function returns false.
204   template<class OutputType>
EncodeChunk(const char * data,size_t len,OutputType * output)205   bool EncodeChunk(const char* data, size_t len, OutputType* output) {
206     OutputString<OutputType> output_string(output);
207     return EncodeChunkToInterface(data, len, &output_string);
208   }
209 
210   bool EncodeChunkToInterface(const char* data, size_t len,
211                               OutputStringInterface* output_string);
212 
213   // Finishes encoding and appends any leftover encoded data to *output_string.
214   // If an error occurs (for example, if StartEncoding was not called
215   // earlier or StartEncoding returned false), this function returns false;
216   // otherwise it returns true.  The caller does not need to
217   // do any cleanup except destroying the VCDiffStreamingEncoder
218   // if this function returns false.
219   template<class OutputType>
FinishEncoding(OutputType * output)220   bool FinishEncoding(OutputType* output) {
221     OutputString<OutputType> output_string(output);
222     return FinishEncodingToInterface(&output_string);
223   }
224 
225   bool FinishEncodingToInterface(OutputStringInterface* output_string);
226 
227   // Replaces the contents of match_counts with a vector of integers,
228   // one for each possible match length.  The value of match_counts[n]
229   // is equal to the number of matches of length n found so far
230   // for this VCDiffStreamingEncoder object.
231   void GetMatchCounts(std::vector<int>* match_counts) const;
232 
233  private:
234   VCDiffStreamingEncoderImpl* const impl_;
235 
236   // Make the copy constructor and assignment operator private
237   // so that they don't inadvertently get used.
238   VCDiffStreamingEncoder(const VCDiffStreamingEncoder&);  // NOLINT
239   void operator=(const VCDiffStreamingEncoder&);
240 };
241 
242 // A simpler (non-streaming) interface to the VCDIFF encoder that can be used
243 // if the entire target data string is available.
244 //
245 class VCDiffEncoder {
246  public:
VCDiffEncoder(const char * dictionary_contents,size_t dictionary_size)247   VCDiffEncoder(const char* dictionary_contents, size_t dictionary_size)
248       : dictionary_(dictionary_contents, dictionary_size),
249         encoder_(NULL),
250         flags_(VCD_STANDARD_FORMAT),
251         look_for_target_matches_(true) { }
252 
~VCDiffEncoder()253   ~VCDiffEncoder() {
254     delete encoder_;
255   }
256 
257   // By default, VCDiffEncoder uses standard VCDIFF format.  This function
258   // can be used before calling Encode(), to specify that interleaved format
259   // and/or checksum format should be used.
SetFormatFlags(VCDiffFormatExtensionFlags flags)260   void SetFormatFlags(VCDiffFormatExtensionFlags flags) { flags_ = flags; }
261 
262   // By default, VCDiffEncoder looks for matches in the dictionary and also in
263   // the previously encoded target data.  This function can be used before
264   // calling Encode(), to specify whether or not target matching should be
265   // enabled.
SetTargetMatching(bool look_for_target_matches)266   void SetTargetMatching(bool look_for_target_matches) {
267     look_for_target_matches_ = look_for_target_matches;
268   }
269 
270   // Replaces old contents of output_string with the encoded form of
271   // target_data.
272   template<class OutputType>
Encode(const char * target_data,size_t target_len,OutputType * output)273   bool Encode(const char* target_data,
274               size_t target_len,
275               OutputType* output) {
276     OutputString<OutputType> output_string(output);
277     return EncodeToInterface(target_data, target_len, &output_string);
278   }
279 
280  private:
281   bool EncodeToInterface(const char* target_data,
282                          size_t target_len,
283                          OutputStringInterface* output_string);
284 
285   HashedDictionary dictionary_;
286   VCDiffStreamingEncoder* encoder_;
287   VCDiffFormatExtensionFlags flags_;
288   bool look_for_target_matches_;
289 
290   // Make the copy constructor and assignment operator private
291   // so that they don't inadvertently get used.
292   VCDiffEncoder(const VCDiffEncoder&);  // NOLINT
293   void operator=(const VCDiffEncoder&);
294 };
295 
296 }  // namespace open_vcdiff
297 
298 #endif  // OPEN_VCDIFF_VCENCODER_H_
299