• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2008 Google Inc.
2 // Author: Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #ifndef OPEN_VCDIFF_ENCODETABLE_H_
17 #define OPEN_VCDIFF_ENCODETABLE_H_
18 
19 #include <config.h>
20 #include <stddef.h>  // size_t
21 #include <stdint.h>  // int32_t
22 #include <string>
23 #include <vector>
24 #include "addrcache.h"
25 #include "checksum.h"
26 #include "codetable.h"
27 #include "codetablewriter_interface.h"
28 
29 namespace open_vcdiff {
30 
31 class OutputStringInterface;
32 class VCDiffInstructionMap;
33 
34 // The method calls after construction *must* conform
35 // to the following pattern:
36 //    {{Add|Copy|Run}* [AddChecksum] Output}*
37 //
38 // When Output has been called in this sequence, a complete target window
39 // (as defined in RFC 3284 section 4.3) will have been appended to
40 // out (unless no calls to Add, Run, or Copy were made, in which
41 // case Output will do nothing.)  The output will not be available for use
42 // until after each call to Output().
43 //
44 // NOT threadsafe.
45 //
46 class VCDiffCodeTableWriter : public CodeTableWriterInterface {
47  public:
48   // This constructor uses the default code table.
49   // If interleaved is true, the encoder writes each delta file window
50   // by interleaving instructions and sizes with their corresponding
51   // addresses and data, rather than placing these elements into three
52   // separate sections.  This facilitates providing partially
53   // decoded results when only a portion of a delta file window
54   // is received (e.g. when HTTP over TCP is used as the
55   // transmission protocol.)  The interleaved format is
56   // not consistent with the VCDIFF draft standard.
57   //
58   explicit VCDiffCodeTableWriter(bool interleaved);
59 
60   // Uses a non-standard code table and non-standard cache sizes.  The caller
61   // must guarantee that code_table_data remains allocated for the lifetime of
62   // the VCDiffCodeTableWriter object.  Note that this is different from how
63   // VCDiffCodeTableReader::UseCodeTable works.  It is assumed that a given
64   // encoder will use either the default code table or a statically-defined
65   // non-standard code table, whereas the decoder must have the ability to read
66   // an arbitrary non-standard code table from a delta file and discard it once
67   // the file has been decoded.
68   //
69   VCDiffCodeTableWriter(bool interleaved,
70                         int near_cache_size,
71                         int same_cache_size,
72                         const VCDiffCodeTableData& code_table_data,
73                         unsigned char max_mode);
74 
75   virtual ~VCDiffCodeTableWriter();
76 
77   // Initializes the constructed object for use.
78   // This method must be called after a VCDiffCodeTableWriter is constructed
79   // and before any of its other methods can be called.  It will return
80   // false if there was an error initializing the object, or true if it
81   // was successful.  After the object has been initialized and used,
82   // Init() can be called again to restore the initial state of the object.
83   //
84   bool Init(size_t dictionary_size);
85 
target_length()86   virtual size_t target_length() const { return target_length_; }
87 
88   // Encode an ADD opcode with the "size" bytes starting at data
89   virtual void Add(const char* data, size_t size);
90 
91   // Encode a COPY opcode with args "offset" (into dictionary) and "size" bytes.
92   virtual void Copy(int32_t offset, size_t size);
93 
94   // Encode a RUN opcode for "size" copies of the value "byte".
95   virtual void Run(size_t size, unsigned char byte);
96 
AddChecksum(VCDChecksum checksum)97   void AddChecksum(VCDChecksum checksum) {
98     add_checksum_ = true;
99     checksum_ = checksum;
100   }
101 
102   // Finishes encoding and appends the encoded delta window to the output
103   // string.  The output string is not null-terminated and may contain embedded
104   // '\0' characters.
105   virtual void Output(OutputStringInterface* out);
106 
match_counts()107   const std::vector<int>& match_counts() const { return match_counts_; }
108 
109  private:
110   typedef std::string string;
111 
112   // This is an estimate of the longest match size the encoder expects to find.
113   // It is used to determine the initial size of the vector match_counts_.
114   // If it is too large, then some space will be wasted on vector elements
115   // that are not used.  If it is too small, then some time will be wasted
116   // expanding match_counts_ to accommodate larger match sizes.
117   static const size_t kMaxMatchSize = 2000;
118 
119   // The maximum value for the mode of a COPY instruction.
120   const unsigned char max_mode_;
121 
122   // If interleaved is true, sets data_for_add_and_run_ and
123   // addresses_for_copy_ to point at instructions_and_sizes_,
124   // so that instructions, sizes, addresses and data will be
125   // combined into a single interleaved stream.
126   // If interleaved is false, sets data_for_add_and_run_ and
127   // addresses_for_copy_ to point at their corresponding
128   // separate_... strings, so that the three sections will
129   // be generated separately from one another.
130   //
131   void InitSectionPointers(bool interleaved);
132 
133   // Determines the best opcode to encode an instruction, and appends
134   // or substitutes that opcode and its size into the
135   // instructions_and_sizes_ string.
136   //
137   void EncodeInstruction(VCDiffInstructionType inst,
138                          size_t size,
139                          unsigned char mode);
140 
EncodeInstruction(VCDiffInstructionType inst,size_t size)141   void EncodeInstruction(VCDiffInstructionType inst, size_t size) {
142     return EncodeInstruction(inst, size, 0);
143   }
144 
145   // Calculates the number of bytes needed to store the given size value as a
146   // variable-length integer (VarintBE).
147   static size_t CalculateLengthOfSizeAsVarint(size_t size);
148 
149   // Appends the size value to the string as a variable-length integer.
150   static void AppendSizeToString(size_t size, string* out);
151 
152   // Appends the size value to the output string as a variable-length integer.
153   static void AppendSizeToOutputString(size_t size, OutputStringInterface* out);
154 
155   // Calculates the "Length of the delta encoding" field for the delta window
156   // header, based on the sizes of the sections and of the other header
157   // elements.
158   size_t CalculateLengthOfTheDeltaEncoding() const;
159 
160   // None of the following 'string' objects are null-terminated.
161 
162   // A series of instruction opcodes, each of which may be followed
163   // by one or two Varint values representing the size parameters
164   // of the first and second instruction in the opcode.
165   string instructions_and_sizes_;
166 
167   // A series of data arguments (byte values) used for ADD and RUN
168   // instructions.  Depending on whether interleaved output is used
169   // for streaming or not, the pointer may point to
170   // separate_data_for_add_and_run_ or to instructions_and_sizes_.
171   string *data_for_add_and_run_;
172   string separate_data_for_add_and_run_;
173 
174   // A series of Varint addresses used for COPY instructions.
175   // For the SAME mode, a byte value is stored instead of a Varint.
176   // Depending on whether interleaved output is used
177   // for streaming or not, the pointer may point to
178   // separate_addresses_for_copy_ or to instructions_and_sizes_.
179   string *addresses_for_copy_;
180   string separate_addresses_for_copy_;
181 
182   VCDiffAddressCache address_cache_;
183 
184   size_t dictionary_size_;
185 
186   // The number of bytes of target data that has been encoded so far.
187   // Each time Add(), Copy(), or Run() is called, this will be incremented.
188   // The target length is used to compute HERE mode addresses
189   // for COPY instructions, and is also written into the header
190   // of the delta window when Output() is called.
191   //
192   size_t target_length_;
193 
194   const VCDiffCodeTableData* code_table_data_;
195 
196   // The instruction map facilitates finding an opcode quickly given an
197   // instruction inst, size, and mode.  This is an alternate representation
198   // of the same information that is found in code_table_data_.
199   //
200   const VCDiffInstructionMap* instruction_map_;
201 
202   // The zero-based index within instructions_and_sizes_ of the byte
203   // that contains the last single-instruction opcode generated by
204   // EncodeInstruction().  (See that function for exhaustive details.)
205   // It is necessary to use an index rather than a pointer for this value
206   // because instructions_and_sizes_ may be resized, which would invalidate
207   // any pointers into its data buffer.  The value -1 is reserved to mean that
208   // either no opcodes have been generated yet, or else the last opcode
209   // generated was a double-instruction opcode.
210   //
211   int last_opcode_index_;
212 
213   // If true, an Adler32 checksum of the target window data will be written as
214   // a variable-length integer, just after the size of the addresses section.
215   //
216   bool add_checksum_;
217 
218   // The checksum to be written to the current target window,
219   // if add_checksum_ is true.
220   // This will not be calculated based on the individual calls to Add(), Run(),
221   // and Copy(), which would be unnecessarily expensive.  Instead, the code
222   // that uses the VCDiffCodeTableWriter object is expected to calculate
223   // the checksum all at once and to call AddChecksum() with that value.
224   // Must be called sometime before calling Output(), though it can be called
225   // either before or after the calls to Add(), Run(), and Copy().
226   //
227   VCDChecksum checksum_;
228 
229   // The value of match_counts_[n] is equal to the number of matches
230   // of length n (that is, COPY instructions of size n) found so far.
231   std::vector<int> match_counts_;
232 
233   // Making these private avoids implicit copy constructor & assignment operator
234   VCDiffCodeTableWriter(const VCDiffCodeTableWriter&);  // NOLINT
235   void operator=(const VCDiffCodeTableWriter&);
236 };
237 
238 };  // namespace open_vcdiff
239 
240 #endif  // OPEN_VCDIFF_ENCODETABLE_H_
241