1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include <config.h>
17 #include "vcdiffengine.h"
18 #include <stdint.h> // uint32_t
19 #include <string.h> // memcpy
20 #include "blockhash.h"
21 #include "codetablewriter_interface.h"
22 #include "logging.h"
23 #include "rolling_hash.h"
24
25 namespace open_vcdiff {
26
VCDiffEngine(const char * dictionary,size_t dictionary_size)27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28 // If dictionary_size == 0, then dictionary could be NULL. Guard against
29 // using a NULL value.
30 : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31 dictionary_size_(dictionary_size),
32 hashed_dictionary_(NULL) {
33 if (dictionary_size > 0) {
34 memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35 }
36 }
37
~VCDiffEngine()38 VCDiffEngine::~VCDiffEngine() {
39 delete hashed_dictionary_;
40 if (dictionary_size_ > 0) {
41 delete[] dictionary_;
42 }
43 }
44
Init()45 bool VCDiffEngine::Init() {
46 if (hashed_dictionary_) {
47 LOG(DFATAL) << "Init() called twice for same VCDiffEngine object"
48 << LOG_ENDL;
49 return false;
50 }
51 hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52 dictionary_size());
53 if (!hashed_dictionary_) {
54 LOG(DFATAL) << "Creation of dictionary hash failed" << LOG_ENDL;
55 return false;
56 }
57 if (!RollingHash<BlockHash::kBlockSize>::Init()) {
58 LOG(DFATAL) << "RollingHash initialization failed" << LOG_ENDL;
59 return false;
60 }
61 return true;
62 }
63
64 // This helper function tries to find an appropriate match within
65 // hashed_dictionary_ for the block starting at the current target position.
66 // If target_hash is not NULL, this function will also look for a match
67 // within the previously encoded target data.
68 //
69 // If a match is found, this function will generate an ADD instruction
70 // for all unencoded data that precedes the match,
71 // and a COPY instruction for the match itself; then it returns
72 // the number of bytes processed by both instructions,
73 // which is guaranteed to be > 0.
74 // If no appropriate match is found, the function returns 0.
75 //
76 // The first four parameters are input parameters which are passed
77 // directly to BlockHash::FindBestMatch; please see that function
78 // for a description of their allowable values.
79 template<bool look_for_target_matches>
EncodeCopyForBestMatch(uint32_t hash_value,const char * target_candidate_start,const char * unencoded_target_start,size_t unencoded_target_size,const BlockHash * target_hash,CodeTableWriterInterface * coder) const80 inline size_t VCDiffEngine::EncodeCopyForBestMatch(
81 uint32_t hash_value,
82 const char* target_candidate_start,
83 const char* unencoded_target_start,
84 size_t unencoded_target_size,
85 const BlockHash* target_hash,
86 CodeTableWriterInterface* coder) const {
87 // When FindBestMatch() comes up with a match for a candidate block,
88 // it will populate best_match with the size, source offset,
89 // and target offset of the match.
90 BlockHash::Match best_match;
91
92 // First look for a match in the dictionary.
93 hashed_dictionary_->FindBestMatch(hash_value,
94 target_candidate_start,
95 unencoded_target_start,
96 unencoded_target_size,
97 &best_match);
98 // If target matching is enabled, then see if there is a better match
99 // within the target data that has been encoded so far.
100 if (look_for_target_matches) {
101 target_hash->FindBestMatch(hash_value,
102 target_candidate_start,
103 unencoded_target_start,
104 unencoded_target_size,
105 &best_match);
106 }
107 if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
108 return 0;
109 }
110 if (best_match.target_offset() > 0) {
111 // Create an ADD instruction to encode all target bytes
112 // from the end of the last COPY match, if any, up to
113 // the beginning of this COPY match.
114 coder->Add(unencoded_target_start, best_match.target_offset());
115 }
116 coder->Copy(best_match.source_offset(), best_match.size());
117 return best_match.target_offset() // ADD size
118 + best_match.size(); // + COPY size
119 }
120
121 // Once the encoder loop has finished checking for matches in the target data,
122 // this function creates an ADD instruction to encode all target bytes
123 // from the end of the last COPY match, if any, through the end of
124 // the target data. In the worst case, if no matches were found at all,
125 // this function will create one big ADD instruction
126 // for the entire buffer of target data.
AddUnmatchedRemainder(const char * unencoded_target_start,size_t unencoded_target_size,CodeTableWriterInterface * coder) const127 inline void VCDiffEngine::AddUnmatchedRemainder(
128 const char* unencoded_target_start,
129 size_t unencoded_target_size,
130 CodeTableWriterInterface* coder) const {
131 if (unencoded_target_size > 0) {
132 coder->Add(unencoded_target_start, unencoded_target_size);
133 }
134 }
135
136 // This helper function tells the coder to finish the encoding and write
137 // the results into the output string "diff".
FinishEncoding(size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const138 inline void VCDiffEngine::FinishEncoding(
139 size_t target_size,
140 OutputStringInterface* diff,
141 CodeTableWriterInterface* coder) const {
142 if (target_size != static_cast<size_t>(coder->target_length())) {
143 LOG(DFATAL) << "Internal error in VCDiffEngine::Encode: "
144 "original target size (" << target_size
145 << ") does not match number of bytes processed ("
146 << coder->target_length() << ")" << LOG_ENDL;
147 }
148 coder->Output(diff);
149 }
150
151 template<bool look_for_target_matches>
EncodeInternal(const char * target_data,size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const152 void VCDiffEngine::EncodeInternal(const char* target_data,
153 size_t target_size,
154 OutputStringInterface* diff,
155 CodeTableWriterInterface* coder) const {
156 if (!hashed_dictionary_) {
157 LOG(DFATAL) << "Internal error: VCDiffEngine::Encode() "
158 "called before VCDiffEngine::Init()" << LOG_ENDL;
159 return;
160 }
161 if (target_size == 0) {
162 return; // Do nothing for empty target
163 }
164 // Special case for really small input
165 if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
166 AddUnmatchedRemainder(target_data, target_size, coder);
167 FinishEncoding(target_size, diff, coder);
168 return;
169 }
170 RollingHash<BlockHash::kBlockSize> hasher;
171 BlockHash* target_hash = NULL;
172 if (look_for_target_matches) {
173 // Check matches against previously encoded target data
174 // in this same target window, as well as against the dictionary
175 target_hash = BlockHash::CreateTargetHash(target_data,
176 target_size,
177 dictionary_size());
178 if (!target_hash) {
179 LOG(DFATAL) << "Instantiation of target hash failed" << LOG_ENDL;
180 return;
181 }
182 }
183 const char* const target_end = target_data + target_size;
184 const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
185 // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
186 // dictionary)
187 const char* next_encode = target_data;
188 // candidate_pos points to the start of the kBlockSize-byte block that may
189 // begin a match with the dictionary or previously encoded target data.
190 const char* candidate_pos = target_data;
191 uint32_t hash_value = hasher.Hash(candidate_pos);
192 while (1) {
193 const size_t bytes_encoded =
194 EncodeCopyForBestMatch<look_for_target_matches>(
195 hash_value,
196 candidate_pos,
197 next_encode,
198 (target_end - next_encode),
199 target_hash,
200 coder);
201 if (bytes_encoded > 0) {
202 next_encode += bytes_encoded; // Advance past COPYed data
203 candidate_pos = next_encode;
204 if (candidate_pos > start_of_last_block) {
205 break; // Reached end of target data
206 }
207 // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
208 // can't be used to calculate the hash value at its new position.
209 hash_value = hasher.Hash(candidate_pos);
210 if (look_for_target_matches) {
211 // Update the target hash for the ADDed and COPYed data
212 target_hash->AddAllBlocksThroughIndex(
213 static_cast<int>(next_encode - target_data));
214 }
215 } else {
216 // No match, or match is too small to be worth a COPY instruction.
217 // Move to the next position in the target data.
218 if ((candidate_pos + 1) > start_of_last_block) {
219 break; // Reached end of target data
220 }
221 if (look_for_target_matches) {
222 target_hash->AddOneIndexHash(
223 static_cast<int>(candidate_pos - target_data),
224 hash_value);
225 }
226 hash_value = hasher.UpdateHash(hash_value,
227 candidate_pos[0],
228 candidate_pos[BlockHash::kBlockSize]);
229 ++candidate_pos;
230 }
231 }
232 AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
233 FinishEncoding(target_size, diff, coder);
234 delete target_hash;
235 }
236
Encode(const char * target_data,size_t target_size,bool look_for_target_matches,OutputStringInterface * diff,CodeTableWriterInterface * coder) const237 void VCDiffEngine::Encode(const char* target_data,
238 size_t target_size,
239 bool look_for_target_matches,
240 OutputStringInterface* diff,
241 CodeTableWriterInterface* coder) const {
242 if (look_for_target_matches) {
243 EncodeInternal<true>(target_data, target_size, diff, coder);
244 } else {
245 EncodeInternal<false>(target_data, target_size, diff, coder);
246 }
247 }
248
249 } // namespace open_vcdiff
250