• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include <config.h>
17 #include "vcdiffengine.h"
18 #include <stdint.h>  // uint32_t
19 #include <string.h>  // memcpy
20 #include "blockhash.h"
21 #include "codetablewriter_interface.h"
22 #include "logging.h"
23 #include "rolling_hash.h"
24 
25 namespace open_vcdiff {
26 
VCDiffEngine(const char * dictionary,size_t dictionary_size)27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28     // If dictionary_size == 0, then dictionary could be NULL.  Guard against
29     // using a NULL value.
30     : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31       dictionary_size_(dictionary_size),
32       hashed_dictionary_(NULL) {
33   if (dictionary_size > 0) {
34     memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35   }
36 }
37 
~VCDiffEngine()38 VCDiffEngine::~VCDiffEngine() {
39   delete hashed_dictionary_;
40   if (dictionary_size_ > 0) {
41     delete[] dictionary_;
42   }
43 }
44 
Init()45 bool VCDiffEngine::Init() {
46   if (hashed_dictionary_) {
47     LOG(DFATAL) << "Init() called twice for same VCDiffEngine object"
48                 << LOG_ENDL;
49     return false;
50   }
51   hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52                                                        dictionary_size());
53   if (!hashed_dictionary_) {
54     LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL;
55     return false;
56   }
57   if (!RollingHash<BlockHash::kBlockSize>::Init()) {
58     LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL;
59     return false;
60   }
61   return true;
62 }
63 
64 // This helper function tries to find an appropriate match within
65 // hashed_dictionary_ for the block starting at the current target position.
66 // If target_hash is not NULL, this function will also look for a match
67 // within the previously encoded target data.
68 //
69 // If a match is found, this function will generate an ADD instruction
70 // for all unencoded data that precedes the match,
71 // and a COPY instruction for the match itself; then it returns
72 // the number of bytes processed by both instructions,
73 // which is guaranteed to be > 0.
74 // If no appropriate match is found, the function returns 0.
75 //
76 // The first four parameters are input parameters which are passed
77 // directly to BlockHash::FindBestMatch; please see that function
78 // for a description of their allowable values.
79 template<bool look_for_target_matches>
EncodeCopyForBestMatch(uint32_t hash_value,const char * target_candidate_start,const char * unencoded_target_start,size_t unencoded_target_size,const BlockHash * target_hash,CodeTableWriterInterface * coder) const80 inline size_t VCDiffEngine::EncodeCopyForBestMatch(
81     uint32_t hash_value,
82     const char* target_candidate_start,
83     const char* unencoded_target_start,
84     size_t unencoded_target_size,
85     const BlockHash* target_hash,
86     CodeTableWriterInterface* coder) const {
87   // When FindBestMatch() comes up with a match for a candidate block,
88   // it will populate best_match with the size, source offset,
89   // and target offset of the match.
90   BlockHash::Match best_match;
91 
92   // First look for a match in the dictionary.
93   hashed_dictionary_->FindBestMatch(hash_value,
94                                     target_candidate_start,
95                                     unencoded_target_start,
96                                     unencoded_target_size,
97                                     &best_match);
98   // If target matching is enabled, then see if there is a better match
99   // within the target data that has been encoded so far.
100   if (look_for_target_matches) {
101     target_hash->FindBestMatch(hash_value,
102                                target_candidate_start,
103                                unencoded_target_start,
104                                unencoded_target_size,
105                                &best_match);
106   }
107   if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
108     return 0;
109   }
110   if (best_match.target_offset() > 0) {
111     // Create an ADD instruction to encode all target bytes
112     // from the end of the last COPY match, if any, up to
113     // the beginning of this COPY match.
114     coder->Add(unencoded_target_start, best_match.target_offset());
115   }
116   coder->Copy(best_match.source_offset(), best_match.size());
117   return best_match.target_offset()  // ADD size
118        + best_match.size();          // + COPY size
119 }
120 
121 // Once the encoder loop has finished checking for matches in the target data,
122 // this function creates an ADD instruction to encode all target bytes
123 // from the end of the last COPY match, if any, through the end of
124 // the target data.  In the worst case, if no matches were found at all,
125 // this function will create one big ADD instruction
126 // for the entire buffer of target data.
AddUnmatchedRemainder(const char * unencoded_target_start,size_t unencoded_target_size,CodeTableWriterInterface * coder) const127 inline void VCDiffEngine::AddUnmatchedRemainder(
128     const char* unencoded_target_start,
129     size_t unencoded_target_size,
130     CodeTableWriterInterface* coder) const {
131   if (unencoded_target_size > 0) {
132     coder->Add(unencoded_target_start, unencoded_target_size);
133   }
134 }
135 
136 // This helper function tells the coder to finish the encoding and write
137 // the results into the output string "diff".
FinishEncoding(size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const138 inline void VCDiffEngine::FinishEncoding(
139     size_t target_size,
140     OutputStringInterface* diff,
141     CodeTableWriterInterface* coder) const {
142   if (target_size != static_cast<size_t>(coder->target_length())) {
143     LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: "
144                    "original target size (" << target_size
145                 << ") does not match number of bytes processed ("
146                 << coder->target_length() << ")" << LOG_ENDL;
147   }
148   coder->Output(diff);
149 }
150 
151 template<bool look_for_target_matches>
EncodeInternal(const char * target_data,size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const152 void VCDiffEngine::EncodeInternal(const char* target_data,
153                                   size_t target_size,
154                                   OutputStringInterface* diff,
155                                   CodeTableWriterInterface* coder) const {
156   if (!hashed_dictionary_) {
157     LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() "
158                    "called before VCDiffEngine::Init()" << LOG_ENDL;
159     return;
160   }
161   if (target_size == 0) {
162     return;  // Do nothing for empty target
163   }
164   // Special case for really small input
165   if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
166     AddUnmatchedRemainder(target_data, target_size, coder);
167     FinishEncoding(target_size, diff, coder);
168     return;
169   }
170   RollingHash<BlockHash::kBlockSize> hasher;
171   BlockHash* target_hash = NULL;
172   if (look_for_target_matches) {
173     // Check matches against previously encoded target data
174     // in this same target window, as well as against the dictionary
175     target_hash = BlockHash::CreateTargetHash(target_data,
176                                               target_size,
177                                               dictionary_size());
178     if (!target_hash) {
179       LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL;
180       return;
181     }
182   }
183   const char* const target_end = target_data + target_size;
184   const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
185   // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
186   // dictionary)
187   const char* next_encode = target_data;
188   // candidate_pos points to the start of the kBlockSize-byte block that may
189   // begin a match with the dictionary or previously encoded target data.
190   const char* candidate_pos = target_data;
191   uint32_t hash_value = hasher.Hash(candidate_pos);
192   while (1) {
193     const size_t bytes_encoded =
194         EncodeCopyForBestMatch<look_for_target_matches>(
195             hash_value,
196             candidate_pos,
197             next_encode,
198             (target_end - next_encode),
199             target_hash,
200             coder);
201     if (bytes_encoded > 0) {
202       next_encode += bytes_encoded;  // Advance past COPYed data
203       candidate_pos = next_encode;
204       if (candidate_pos > start_of_last_block) {
205         break;  // Reached end of target data
206       }
207       // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
208       // can't be used to calculate the hash value at its new position.
209       hash_value = hasher.Hash(candidate_pos);
210       if (look_for_target_matches) {
211         // Update the target hash for the ADDed and COPYed data
212         target_hash->AddAllBlocksThroughIndex(
213             static_cast<int>(next_encode - target_data));
214       }
215     } else {
216       // No match, or match is too small to be worth a COPY instruction.
217       // Move to the next position in the target data.
218       if ((candidate_pos + 1) > start_of_last_block) {
219         break;  // Reached end of target data
220       }
221       if (look_for_target_matches) {
222         target_hash->AddOneIndexHash(
223             static_cast<int>(candidate_pos - target_data),
224             hash_value);
225       }
226       hash_value = hasher.UpdateHash(hash_value,
227                                      candidate_pos[0],
228                                      candidate_pos[BlockHash::kBlockSize]);
229       ++candidate_pos;
230     }
231   }
232   AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
233   FinishEncoding(target_size, diff, coder);
234   delete target_hash;
235 }
236 
Encode(const char * target_data,size_t target_size,bool look_for_target_matches,OutputStringInterface * diff,CodeTableWriterInterface * coder) const237 void VCDiffEngine::Encode(const char* target_data,
238                           size_t target_size,
239                           bool look_for_target_matches,
240                           OutputStringInterface* diff,
241                           CodeTableWriterInterface* coder) const {
242   if (look_for_target_matches) {
243     EncodeInternal<true>(target_data, target_size, diff, coder);
244   } else {
245     EncodeInternal<false>(target_data, target_size, diff, coder);
246   }
247 }
248 
249 }  // namespace open_vcdiff
250