1 // Copyright 2008 Google Inc.
2 // Author: Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 // A command-line interface to the open-vcdiff library.
17
18 #include <config.h>
19 #include <assert.h>
20 #include <errno.h>
21 #ifdef WIN32
22 #include <fcntl.h>
23 #include <io.h>
24 #endif // WIN32
25 #include <stdio.h>
26 #include <string.h> // strerror
27 #include <iostream>
28 #include <memory>
29 #include <string>
30 #include <vector>
31 #include "gflags/gflags.h"
32 #include "google/vcdecoder.h"
33 #include "google/vcencoder.h"
34
35 #ifndef HAS_GLOBAL_STRING
36 using std::string;
37 #endif // !HAS_GLOBAL_STRING
38 using google::GetCommandLineFlagInfoOrDie;
39 using google::ShowUsageWithFlagsRestrict;
40
41 static const size_t kDefaultMaxTargetSize = 1 << 26; // 64 MB
42
43 // Definitions of command-line flags
44 DEFINE_string(dictionary, "",
45 "File containing dictionary data (required)");
46 DEFINE_string(target, "",
47 "Target file (default is stdin for encode, stdout for decode");
48 DEFINE_string(delta, "",
49 "Encoded delta file (default is stdout for encode, "
50 "stdin for decode");
51 // --buffersize is the maximum allowable size of a target window.
52 // This value may be increased if there is sufficient memory available.
53 DEFINE_uint64(buffersize, 1 << 20, // 1 MB
54 "Buffer size for reading input file");
55 DEFINE_bool(allow_vcd_target, true,
56 "If false, the decoder issues an error when the VCD_TARGET flag "
57 "is encountered");
58 DEFINE_bool(checksum, false,
59 "Include an Adler32 checksum of the target data when encoding");
60 DEFINE_bool(interleaved, false, "Use interleaved format");
61 DEFINE_bool(stats, false, "Report compression percentage");
62 DEFINE_bool(target_matches, false, "Find duplicate strings in target data"
63 " as well as dictionary data");
64 DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize,
65 "Maximum target file size allowed by decoder");
66 DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize,
67 "Maximum target window size allowed by decoder");
68
69 static const char* const kUsageString =
70 " {encode | delta | decode | patch }[ <options> ]\n"
71 "encode or delta: create delta file from dictionary and target file\n"
72 "decode or patch: reconstruct target file from dictionary and delta file";
73
74 namespace open_vcdiff {
75
76 class VCDiffFileBasedCoder {
77 public:
78 VCDiffFileBasedCoder();
79 ~VCDiffFileBasedCoder();
80
81 // Once the command-line arguments have been parsed, these functions
82 // will use the supplied options to carry out a file-based encode
83 // or decode operation.
84 bool Encode();
85 bool Decode();
86 bool DecodeAndCompare(); // for "vcdiff test"; compare target with original
87
88 private:
89 // Determines the size of the file. The given file must be an input file
90 // opened for reading only, not an input stream such as stdin. The function
91 // returns true and populates file_size if successful; otherwise, it returns
92 // false.
93 static bool FileSize(FILE* file, size_t* file_size);
94
95 // Opens a file for incremental reading. file_name is the name of the file
96 // to be opened. file_type should be a descriptive name (like "target") for
97 // use in log messages. If successful, returns true and sets *file to a
98 // valid input file, *buffer to a region of memory allocated using malloc()
99 // (so the caller must release it using free()), and buffer_size to the size
100 // of the buffer, which will not be larger than the size of the file, and
101 // will not be smaller than the --buffersize option. If the function fails,
102 // it outputs a log message and returns false.
103 bool OpenFileForReading(const string& file_name,
104 const char* file_type,
105 FILE** file,
106 std::vector<char>* buffer);
107
108 // Opens the dictionary file and reads it into a newly allocated buffer.
109 // If successful, returns true and populates dictionary_ with the dictionary
110 // contents; otherwise, returns false.
111 bool OpenDictionary();
112
113 // Opens the input file (the delta or target file) for reading.
114 // Allocates space for the input buffer. If successful,
115 // input_file_ will be valid and input_buffer_ will be allocated.
OpenInputFile()116 bool OpenInputFile() {
117 return OpenFileForReading(input_file_name_,
118 input_file_type_,
119 &input_file_,
120 &input_buffer_);
121 }
122
123 // Opens the output file (the target or delta file) for writing.
124 // If successful, output_file_ will be valid.
125 bool OpenOutputFile();
126
127 // Opens the output file (the target file) for comparison against the decoded
128 // output when using "vcdiff test".
OpenOutputFileForCompare()129 bool OpenOutputFileForCompare() {
130 return OpenFileForReading(output_file_name_,
131 output_file_type_,
132 &output_file_,
133 &compare_buffer_);
134 }
135
136 // Reads as much input data as possible from the input file
137 // into input_buffer_. If successful, returns true and sets *bytes_read
138 // to the number of bytes read into input_buffer_. If an error occurs,
139 // writes an error log message and returns false.
140 bool ReadInput(size_t* bytes_read);
141
142 // Writes the contents of output to output_file_. If successful, returns
143 // true. If an error occurs, writes an error log message and returns false.
144 bool WriteOutput(const string& output);
145
146 // Reads a number of bytes from output_file_ equal to the size of output,
147 // and compares to make sure they match the contents of output. If the bytes
148 // do not match, or if end of file is reached before the expected number of
149 // bytes have been read, or a read error occurs, the function returns false;
150 // otherwise, returns true.
151 bool CompareOutput(const string& output);
152
153 // Dictionary contents. The entire dictionary file will be read into memory.
154 std::vector<char> dictionary_;
155
156 std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_;
157
158 // These should be set to either "delta" or "target". They are only
159 // used in log messages such as "Error opening delta file..."
160 const char* input_file_type_;
161 const char* output_file_type_;
162
163 // The filenames used for input and output. Will be empty if stdin
164 // or stdout is being used.
165 string input_file_name_;
166 string output_file_name_;
167
168 // stdio-style file handles for the input and output files and the dictionary.
169 // When encoding, input_file_ is the target file and output_file_ is the delta
170 // file; when decoding, the reverse is true. The dictionary is always read
171 // from a file rather than from standard input.
172 FILE* input_file_;
173 FILE* output_file_;
174
175 // A memory buffer used to load the input file into memory. If the input
176 // comes from stdin because no input file was specified, then the size of
177 // input_buffer_ will be the value specified by the --buffersize option.
178 // If the input comes from a file, then the buffer will be allocated to match
179 // the file size, if possible. However, the buffer will not exceed
180 // --buffersize bytes in length.
181 std::vector<char> input_buffer_;
182
183 // A memory buffer used to load the output file into memory for comparison
184 // if "vcdiff test" is specified.
185 std::vector<char> compare_buffer_;
186
187 // Making these private avoids implicit copy constructor & assignment operator
188 VCDiffFileBasedCoder(const VCDiffFileBasedCoder&); // NOLINT
189 void operator=(const VCDiffFileBasedCoder&);
190 };
191
VCDiffFileBasedCoder()192 inline VCDiffFileBasedCoder::VCDiffFileBasedCoder()
193 : input_file_type_(""),
194 output_file_type_(""),
195 input_file_(NULL),
196 output_file_(NULL) { }
197
~VCDiffFileBasedCoder()198 VCDiffFileBasedCoder::~VCDiffFileBasedCoder() {
199 if (input_file_ && (input_file_ != stdin)) {
200 fclose(input_file_);
201 input_file_ = NULL;
202 }
203 if (output_file_ && (output_file_ != stdout)) {
204 fclose(output_file_);
205 output_file_ = NULL;
206 }
207 }
208
FileSize(FILE * file,size_t * file_size)209 bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) {
210 long initial_position = ftell(file);
211 if (fseek(file, 0, SEEK_END) != 0) {
212 return false;
213 }
214 *file_size = static_cast<size_t>(ftell(file));
215 if (fseek(file, initial_position, SEEK_SET) != 0) {
216 return false;
217 }
218 return true;
219 }
220
OpenDictionary()221 bool VCDiffFileBasedCoder::OpenDictionary() {
222 assert(dictionary_.empty());
223 assert(!FLAGS_dictionary.empty());
224 FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb");
225 if (!dictionary_file) {
226 std::cerr << "Error opening dictionary file '" << FLAGS_dictionary
227 << "': " << strerror(errno) << std::endl;
228 return false;
229 }
230 size_t dictionary_size = 0U;
231 if (!FileSize(dictionary_file, &dictionary_size)) {
232 std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary
233 << "': " << strerror(errno) << std::endl;
234 return false;
235 }
236 dictionary_.resize(dictionary_size);
237 if (dictionary_size > 0) {
238 if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file)
239 != dictionary_size) {
240 std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary
241 << "': " << strerror(errno) << std::endl;
242 fclose(dictionary_file);
243 dictionary_.clear();
244 return false;
245 }
246 }
247 fclose(dictionary_file);
248 return true;
249 }
250
OpenFileForReading(const string & file_name,const char * file_type,FILE ** file,std::vector<char> * buffer)251 bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name,
252 const char* file_type,
253 FILE** file,
254 std::vector<char>* buffer) {
255 assert(buffer->empty());
256 size_t buffer_size = 0U;
257 if (!*file && file_name.empty()) {
258 #ifdef WIN32
259 _setmode(_fileno(stdin), _O_BINARY);
260 #endif
261 *file = stdin;
262 buffer_size = static_cast<size_t>(FLAGS_buffersize);
263 } else {
264 if (!*file) {
265 *file = fopen(file_name.c_str(), "rb");
266 if (!*file) {
267 std::cerr << "Error opening " << file_type << " file '"
268 << file_name << "': " << strerror(errno) << std::endl;
269 return false;
270 }
271 }
272 size_t file_size = 0U;
273 if (!FileSize(*file, &file_size)) {
274 std::cerr << "Error finding size of " << file_type << " file '"
275 << file_name << "': " << strerror(errno) << std::endl;
276 return false;
277 }
278 buffer_size = static_cast<size_t>(FLAGS_buffersize);
279 if (file_size < buffer_size) {
280 // Allocate just enough memory to store the entire file
281 buffer_size = file_size;
282 }
283 }
284 buffer->resize(buffer_size);
285 return true;
286 }
287
288 // Opens the output file for streamed read operations using the
289 // standard C I/O library, i.e., fopen(), fwrite(), fclose().
290 // No output buffer is allocated because the encoded/decoded output
291 // is constructed progressively using a std::string object
292 // whose buffer is resized as needed.
OpenOutputFile()293 bool VCDiffFileBasedCoder::OpenOutputFile() {
294 if (output_file_name_.empty()) {
295 #ifdef WIN32
296 _setmode(_fileno(stdout), _O_BINARY);
297 #endif
298 output_file_ = stdout;
299 } else {
300 output_file_ = fopen(output_file_name_.c_str(), "wb");
301 if (!output_file_) {
302 std::cerr << "Error opening " << output_file_type_ << " file '"
303 << output_file_name_
304 << "': " << strerror(errno) << std::endl;
305 return false;
306 }
307 }
308 return true;
309 }
310
ReadInput(size_t * bytes_read)311 bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) {
312 // Read from file or stdin
313 *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_);
314 if (ferror(input_file_)) {
315 std::cerr << "Error reading from " << input_file_type_ << " file '"
316 << input_file_name_
317 << "': " << strerror(errno) << std::endl;
318 return false;
319 }
320 return true;
321 }
322
WriteOutput(const string & output)323 bool VCDiffFileBasedCoder::WriteOutput(const string& output) {
324 if (!output.empty()) {
325 // Some new output has been generated and is ready to be written
326 // to the output file or to stdout.
327 fwrite(output.data(), 1, output.size(), output_file_);
328 if (ferror(output_file_)) {
329 std::cerr << "Error writing " << output.size() << " bytes to "
330 << output_file_type_ << " file '" << output_file_name_
331 << "': " << strerror(errno) << std::endl;
332 return false;
333 }
334 }
335 return true;
336 }
337
CompareOutput(const string & output)338 bool VCDiffFileBasedCoder::CompareOutput(const string& output) {
339 if (!output.empty()) {
340 size_t output_size = output.size();
341 // Some new output has been generated and is ready to be compared against
342 // the output file.
343 if (output_size > compare_buffer_.size()) {
344 compare_buffer_.resize(output_size);
345 }
346 size_t bytes_read = fread(&compare_buffer_[0],
347 1,
348 output_size,
349 output_file_);
350 if (ferror(output_file_)) {
351 std::cerr << "Error reading from " << output_file_type_ << " file '"
352 << output_file_name_ << "': " << strerror(errno) << std::endl;
353 return false;
354 }
355 if (bytes_read < output_size) {
356 std::cerr << "Decoded target is longer than original target file"
357 << std::endl;
358 return false;
359 }
360 if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) {
361 std::cerr << "Original target file does not match decoded target"
362 << std::endl;
363 return false;
364 }
365 }
366 return true;
367 }
368
Encode()369 bool VCDiffFileBasedCoder::Encode() {
370 input_file_type_ = "target";
371 input_file_name_ = FLAGS_target;
372 output_file_type_ = "delta";
373 output_file_name_ = FLAGS_delta;
374 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
375 return false;
376 }
377 // Issue 6: Visual Studio STL produces a runtime exception
378 // if &dictionary_[0] is attempted for an empty dictionary.
379 if (dictionary_.empty()) {
380 hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0));
381 } else {
382 hashed_dictionary_.reset(
383 new open_vcdiff::HashedDictionary(&dictionary_[0],
384 dictionary_.size()));
385 }
386 if (!hashed_dictionary_->Init()) {
387 std::cerr << "Error initializing hashed dictionary" << std::endl;
388 return false;
389 }
390 VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT;
391 if (FLAGS_interleaved) {
392 format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED;
393 }
394 if (FLAGS_checksum) {
395 format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM;
396 }
397 open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(),
398 format_flags,
399 FLAGS_target_matches);
400 string output;
401 size_t input_size = 0;
402 size_t output_size = 0;
403 {
404 if (!encoder.StartEncoding(&output)) {
405 std::cerr << "Error during encoder initialization" << std::endl;
406 return false;
407 }
408 }
409 do {
410 size_t bytes_read = 0;
411 if (!WriteOutput(output) || !ReadInput(&bytes_read)) {
412 return false;
413 }
414 output_size += output.size();
415 output.clear();
416 if (bytes_read > 0) {
417 input_size += bytes_read;
418 if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) {
419 std::cerr << "Error trying to encode data chunk of length "
420 << bytes_read << std::endl;
421 return false;
422 }
423 }
424 } while (!feof(input_file_));
425 encoder.FinishEncoding(&output);
426 if (!WriteOutput(output)) {
427 return false;
428 }
429 output_size += output.size();
430 output.clear();
431 if (FLAGS_stats && (input_size > 0)) {
432 std::cerr << "Original size: " << input_size
433 << "\tCompressed size: " << output_size << " ("
434 << ((static_cast<double>(output_size) / input_size) * 100)
435 << "% of original)" << std::endl;
436 }
437 return true;
438 }
439
Decode()440 bool VCDiffFileBasedCoder::Decode() {
441 input_file_type_ = "delta";
442 input_file_name_ = FLAGS_delta;
443 output_file_type_ = "target";
444 output_file_name_ = FLAGS_target;
445 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
446 return false;
447 }
448
449 open_vcdiff::VCDiffStreamingDecoder decoder;
450 decoder.SetMaximumTargetFileSize(
451 static_cast<size_t>(FLAGS_max_target_file_size));
452 decoder.SetMaximumTargetWindowSize(
453 static_cast<size_t>(FLAGS_max_target_window_size));
454 decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
455 string output;
456 size_t input_size = 0;
457 size_t output_size = 0;
458 // Issue 6: Visual Studio STL produces a runtime exception
459 // if &dictionary_[0] is attempted for an empty dictionary.
460 if (dictionary_.empty()) {
461 decoder.StartDecoding("", 0);
462 } else {
463 decoder.StartDecoding(&dictionary_[0], dictionary_.size());
464 }
465
466 do {
467 size_t bytes_read = 0;
468 if (!ReadInput(&bytes_read)) {
469 return false;
470 }
471 if (bytes_read > 0) {
472 input_size += bytes_read;
473 if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
474 std::cerr << "Error trying to decode data chunk of length "
475 << bytes_read << std::endl;
476 return false;
477 }
478 }
479 if (!WriteOutput(output)) {
480 return false;
481 }
482 output_size += output.size();
483 output.clear();
484 } while (!feof(input_file_));
485 if (!decoder.FinishDecoding()) {
486 std::cerr << "Decode error; '" << FLAGS_delta
487 << " may not be a valid VCDIFF delta file" << std::endl;
488 return false;
489 }
490 if (!WriteOutput(output)) {
491 return false;
492 }
493 output_size += output.size();
494 output.clear();
495 if (FLAGS_stats && (output_size > 0)) {
496 std::cerr << "Decompressed size: " << output_size
497 << "\tCompressed size: " << input_size << " ("
498 << ((static_cast<double>(input_size) / output_size) * 100)
499 << "% of original)" << std::endl;
500 }
501 return true;
502 }
503
DecodeAndCompare()504 bool VCDiffFileBasedCoder::DecodeAndCompare() {
505 input_file_type_ = "delta";
506 input_file_name_ = FLAGS_delta;
507 output_file_type_ = "target";
508 output_file_name_ = FLAGS_target;
509 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) {
510 return false;
511 }
512
513 open_vcdiff::VCDiffStreamingDecoder decoder;
514 decoder.SetMaximumTargetFileSize(
515 static_cast<size_t>(FLAGS_max_target_file_size));
516 decoder.SetMaximumTargetWindowSize(
517 static_cast<size_t>(FLAGS_max_target_window_size));
518 decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
519 string output;
520 size_t input_size = 0;
521 size_t output_size = 0;
522 // Issue 6: Visual Studio STL produces a runtime exception
523 // if &dictionary_[0] is attempted for an empty dictionary.
524 if (dictionary_.empty()) {
525 decoder.StartDecoding("", 0);
526 } else {
527 decoder.StartDecoding(&dictionary_[0], dictionary_.size());
528 }
529
530 do {
531 size_t bytes_read = 0;
532 if (!ReadInput(&bytes_read)) {
533 return false;
534 }
535 if (bytes_read > 0) {
536 input_size += bytes_read;
537 if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
538 std::cerr << "Error trying to decode data chunk of length "
539 << bytes_read << std::endl;
540 return false;
541 }
542 }
543 if (!CompareOutput(output)) {
544 return false;
545 }
546 output_size += output.size();
547 output.clear();
548 } while (!feof(input_file_));
549 if (!decoder.FinishDecoding()) {
550 std::cerr << "Decode error; '" << FLAGS_delta
551 << " may not be a valid VCDIFF delta file" << std::endl;
552 return false;
553 }
554 if (!CompareOutput(output)) {
555 return false;
556 }
557 output_size += output.size();
558 output.clear();
559 if (fgetc(output_file_) != EOF) {
560 std::cerr << "Decoded target is shorter than original target file"
561 << std::endl;
562 return false;
563 }
564 if (ferror(output_file_)) {
565 std::cerr << "Error reading end-of-file indicator from target file"
566 << std::endl;
567 return false;
568 }
569 if (FLAGS_stats && (output_size > 0)) {
570 std::cerr << "Decompressed size: " << output_size
571 << "\tCompressed size: " << input_size << " ("
572 << ((static_cast<double>(input_size) / output_size) * 100)
573 << "% of original)" << std::endl;
574 }
575 return true;
576 }
577
578 } // namespace open_vcdiff
579
main(int argc,char ** argv)580 int main(int argc, char** argv) {
581 const char* const command_name = argv[0];
582 google::SetUsageMessage(kUsageString);
583 google::ParseCommandLineFlags(&argc, &argv, true);
584 if (argc != 2) {
585 std::cerr << command_name << ": Must specify exactly one command option"
586 << std::endl;
587 ShowUsageWithFlagsRestrict(command_name, "vcdiff");
588 return 1;
589 }
590 const char* const command_option = argv[1];
591 if (FLAGS_dictionary.empty()) {
592 std::cerr << command_name << " " << command_option
593 << ": Must specify --dictionary <file-name>" << std::endl;
594 ShowUsageWithFlagsRestrict(command_name, "vcdiff");
595 return 1;
596 }
597 if (!GetCommandLineFlagInfoOrDie("buffersize").is_default &&
598 (FLAGS_buffersize == 0)) {
599 std::cerr << command_name << ": Option --buffersize cannot be 0"
600 << std::endl;
601 ShowUsageWithFlagsRestrict(command_name, "vcdiff");
602 return 1;
603 }
604 if ((strcmp(command_option, "encode") == 0) ||
605 (strcmp(command_option, "delta") == 0)) {
606 open_vcdiff::VCDiffFileBasedCoder coder;
607 if (!coder.Encode()) {
608 return 1;
609 }
610 // The destructor for VCDiffFileBasedCoder will clean up the open files
611 // and allocated memory.
612 } else if ((strcmp(command_option, "decode") == 0) ||
613 (strcmp(command_option, "patch") == 0)) {
614 open_vcdiff::VCDiffFileBasedCoder coder;
615 if (!coder.Decode()) {
616 return 1;
617 }
618 } else if ((strcmp(command_option, "test") == 0)) {
619 // "vcdiff test" does not appear in the usage string, but can be
620 // used for debugging. It encodes, then decodes, then compares the result
621 // with the original target. It expects the same arguments as
622 // "vcdiff encode", with the additional requirement that the --target
623 // and --delta file arguments must be specified, rather than using stdin
624 // or stdout. It produces a delta file just as for "vcdiff encode".
625 if (FLAGS_target.empty() || FLAGS_delta.empty()) {
626 std::cerr << command_name
627 << " test: Must specify both --target <file-name>"
628 " and --delta <file-name>" << std::endl;
629 return 1;
630 }
631 const string original_target(FLAGS_target);
632 // Put coder into a separate scope.
633 {
634 open_vcdiff::VCDiffFileBasedCoder coder;
635 if (!coder.Encode()) {
636 return 1;
637 }
638 }
639 {
640 open_vcdiff::VCDiffFileBasedCoder coder;
641 if (!coder.DecodeAndCompare()) {
642 return 1;
643 }
644 }
645 } else {
646 std::cerr << command_name << ": Unrecognized command option "
647 << command_option << std::endl;
648 ShowUsageWithFlagsRestrict(command_name, "vcdiff");
649 return 1;
650 }
651 return 0;
652 }
653