1 /* gzappend -- command to append to a gzip file
2
3 Copyright (C) 2003, 2012 Mark Adler, all rights reserved
4 version 1.2, 11 Oct 2012
5
6 This software is provided 'as-is', without any express or implied
7 warranty. In no event will the author be held liable for any damages
8 arising from the use of this software.
9
10 Permission is granted to anyone to use this software for any purpose,
11 including commercial applications, and to alter it and redistribute it
12 freely, subject to the following restrictions:
13
14 1. The origin of this software must not be misrepresented; you must not
15 claim that you wrote the original software. If you use this software
16 in a product, an acknowledgment in the product documentation would be
17 appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and must not be
19 misrepresented as being the original software.
20 3. This notice may not be removed or altered from any source distribution.
21
22 Mark Adler madler@alumni.caltech.edu
23 */
24
25 /*
26 * Change history:
27 *
28 * 1.0 19 Oct 2003 - First version
29 * 1.1 4 Nov 2003 - Expand and clarify some comments and notes
30 * - Add version and copyright to help
31 * - Send help to stdout instead of stderr
32 * - Add some preemptive typecasts
33 * - Add L to constants in lseek() calls
34 * - Remove some debugging information in error messages
35 * - Use new data_type definition for zlib 1.2.1
36 * - Simplify and unify file operations
37 * - Finish off gzip file in gztack()
38 * - Use deflatePrime() instead of adding empty blocks
39 * - Keep gzip file clean on appended file read errors
40 * - Use in-place rotate instead of auxiliary buffer
41 * (Why you ask? Because it was fun to write!)
42 * 1.2 11 Oct 2012 - Fix for proper z_const usage
43 * - Check for input buffer malloc failure
44 */
45
46 /*
47 gzappend takes a gzip file and appends to it, compressing files from the
48 command line or data from stdin. The gzip file is written to directly, to
49 avoid copying that file, in case it's large. Note that this results in the
50 unfriendly behavior that if gzappend fails, the gzip file is corrupted.
51
52 This program was written to illustrate the use of the new Z_BLOCK option of
53 zlib 1.2.x's inflate() function. This option returns from inflate() at each
54 block boundary to facilitate locating and modifying the last block bit at
55 the start of the final deflate block. Also whether using Z_BLOCK or not,
56 another required feature of zlib 1.2.x is that inflate() now provides the
57 number of unused bits in the last input byte used. gzappend will not work
58 with versions of zlib earlier than 1.2.1.
59
60 gzappend first decompresses the gzip file internally, discarding all but
61 the last 32K of uncompressed data, and noting the location of the last block
62 bit and the number of unused bits in the last byte of the compressed data.
63 The gzip trailer containing the CRC-32 and length of the uncompressed data
64 is verified. This trailer will be later overwritten.
65
66 Then the last block bit is cleared by seeking back in the file and rewriting
67 the byte that contains it. Seeking forward, the last byte of the compressed
68 data is saved along with the number of unused bits to initialize deflate.
69
70 A deflate process is initialized, using the last 32K of the uncompressed
71 data from the gzip file to initialize the dictionary. If the total
72 uncompressed data was less than 32K, then all of it is used to initialize
73 the dictionary. The deflate output bit buffer is also initialized with the
74 last bits from the original deflate stream. From here on, the data to
75 append is simply compressed using deflate, and written to the gzip file.
76 When that is complete, the new CRC-32 and uncompressed length are written
77 as the trailer of the gzip file.
78 */
79
80 #include <stdio.h>
81 #include <stdlib.h>
82 #include <string.h>
83 #include <fcntl.h>
84 #include <unistd.h>
85 #include "zlib.h"
86
/* shorthand used on every file-scope function in this program */
#define local static
/* base-2 logarithm of the I/O buffer size */
#define LGCHUNK 14
/* size in bytes of the read, input, and output buffers */
#define CHUNK (1U << LGCHUNK)
/* size in bytes of the sliding window kept while scanning the gzip file,
   and hence the most data that can be handed to deflate as a dictionary */
#define DSIZE 32768U
91
/* print "gzappend error: " followed by msg1 and msg2 to stderr, then terminate
   the process with exit status 1 -- this function does not return; the
   messages are only read, so they are taken as pointers to const, which lets
   callers pass string literals without discarding qualifiers */
local void bye(const char *msg1, const char *msg2)
{
    fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
    exit(1);
}
98
/* return the greatest common divisor of a and b (gcd(x, 0) == gcd(0, x) == x)
   using a subtractive form of Euclid's algorithm: on each pass the smaller
   value is doubled for as long as it still fits twice into the larger one,
   and that multiple is subtracted from the larger value, which keeps the
   number of passes small when one argument is much greater than the other */
local unsigned gcd(unsigned a, unsigned b)
{
    while (a != 0 && b != 0) {
        /* pick the operand to reduce -- ties reduce b, leaving a intact */
        unsigned *big = a > b ? &a : &b;
        unsigned step = a > b ? b : a;

        /* doubling is safe: it only happens while 2 * step <= *big */
        while (*big - step >= step)
            step <<= 1;
        *big -= step;
    }

    /* one of the two is now zero, so the sum is the survivor */
    return a + b;
}
121
122 /* rotate list[0..len-1] left by rot positions, in place */
rotate(unsigned char * list,unsigned len,unsigned rot)123 local void rotate(unsigned char *list, unsigned len, unsigned rot)
124 {
125 unsigned char tmp;
126 unsigned cycles;
127 unsigned char *start, *last, *to, *from;
128
129 /* normalize rot and handle degenerate cases */
130 if (len < 2) return;
131 if (rot >= len) rot %= len;
132 if (rot == 0) return;
133
134 /* pointer to last entry in list */
135 last = list + (len - 1);
136
137 /* do simple left shift by one */
138 if (rot == 1) {
139 tmp = *list;
140 memmove(list, list + 1, len - 1);
141 *last = tmp;
142 return;
143 }
144
145 /* do simple right shift by one */
146 if (rot == len - 1) {
147 tmp = *last;
148 memmove(list + 1, list, len - 1);
149 *list = tmp;
150 return;
151 }
152
153 /* otherwise do rotate as a set of cycles in place */
154 cycles = gcd(len, rot); /* number of cycles */
155 do {
156 start = from = list + cycles; /* start index is arbitrary */
157 tmp = *from; /* save entry to be overwritten */
158 for (;;) {
159 to = from; /* next step in cycle */
160 from += rot; /* go right rot positions */
161 if (from > last) from -= len; /* (pointer better not wrap) */
162 if (from == start) break; /* all but one shifted */
163 *to = *from; /* shift left */
164 }
165 *to = tmp; /* complete the circle */
166 } while (--cycles);
167 }
168
/* state for buffered reading of the gzip file: a descriptor plus a buffer of
   1 << size bytes that readin() refills and read1()/skip() consume */
typedef struct {
    int fd;                     /* file descriptor passed to read()/lseek() */
    int size;                   /* 1 << size is bytes in buf */
    unsigned left;              /* bytes not yet consumed, starting at next */
    unsigned char *buf;         /* start of the buffer that read() fills */
    z_const unsigned char *next;    /* next unconsumed byte in buf -- only
                                       valid once readin() has been called */
    char *name;                 /* file name for error messages */
} file;
178
179 /* reload buffer */
readin(file * in)180 local int readin(file *in)
181 {
182 int len;
183
184 len = read(in->fd, in->buf, 1 << in->size);
185 if (len == -1) bye("error reading ", in->name);
186 in->left = (unsigned)len;
187 in->next = in->buf;
188 return len;
189 }
190
191 /* read from file in, exit if end-of-file */
readmore(file * in)192 local int readmore(file *in)
193 {
194 if (readin(in) == 0) bye("unexpected end of ", in->name);
195 return 0;
196 }
197
/* yield the next byte of in, refilling the buffer first when it is empty
   (readmore() exits on end-of-file) -- the argument is parenthesized so that
   any pointer expression such as &gz can be passed, but note that it is
   evaluated more than once, so it must not have side effects */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
                   (in)->left--, *((in)->next)++)
200
201 /* skip over n bytes of in */
skip(file * in,unsigned n)202 local void skip(file *in, unsigned n)
203 {
204 unsigned bypass;
205
206 if (n > in->left) {
207 n -= in->left;
208 bypass = n & ~((1U << in->size) - 1);
209 if (bypass) {
210 if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
211 bye("seeking ", in->name);
212 n -= bypass;
213 }
214 readmore(in);
215 if (n > in->left)
216 bye("unexpected end of ", in->name);
217 }
218 in->left -= n;
219 in->next += n;
220 }
221
222 /* read a four-byte unsigned integer, little-endian, from in */
read4(file * in)223 unsigned long read4(file *in)
224 {
225 unsigned long val;
226
227 val = read1(in);
228 val += (unsigned)read1(in) << 8;
229 val += (unsigned long)read1(in) << 16;
230 val += (unsigned long)read1(in) << 24;
231 return val;
232 }
233
234 /* skip over gzip header */
gzheader(file * in)235 local void gzheader(file *in)
236 {
237 int flags;
238 unsigned n;
239
240 if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
241 if (read1(in) != 8) bye("unknown compression method in", in->name);
242 flags = read1(in);
243 if (flags & 0xe0) bye("unknown header flags set in", in->name);
244 skip(in, 6);
245 if (flags & 4) {
246 n = read1(in);
247 n += (unsigned)(read1(in)) << 8;
248 skip(in, n);
249 }
250 if (flags & 8) while (read1(in) != 0) ;
251 if (flags & 16) while (read1(in) != 0) ;
252 if (flags & 2) skip(in, 2);
253 }
254
255 /* decompress gzip file "name", return strm with a deflate stream ready to
256 continue compression of the data in the gzip file, and return a file
257 descriptor pointing to where to write the compressed data -- the deflate
258 stream is initialized to compress using level "level" */
gzscan(char * name,z_stream * strm,int level)259 local int gzscan(char *name, z_stream *strm, int level)
260 {
261 int ret, lastbit, left, full;
262 unsigned have;
263 unsigned long crc, tot;
264 unsigned char *window;
265 off_t lastoff, end;
266 file gz;
267
268 /* open gzip file */
269 gz.name = name;
270 gz.fd = open(name, O_RDWR, 0);
271 if (gz.fd == -1) bye("cannot open ", name);
272 gz.buf = malloc(CHUNK);
273 if (gz.buf == NULL) bye("out of memory", "");
274 gz.size = LGCHUNK;
275 gz.left = 0;
276
277 /* skip gzip header */
278 gzheader(&gz);
279
280 /* prepare to decompress */
281 window = malloc(DSIZE);
282 if (window == NULL) bye("out of memory", "");
283 strm->zalloc = Z_NULL;
284 strm->zfree = Z_NULL;
285 strm->opaque = Z_NULL;
286 ret = inflateInit2(strm, -15);
287 if (ret != Z_OK) bye("out of memory", " or library mismatch");
288
289 /* decompress the deflate stream, saving append information */
290 lastbit = 0;
291 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
292 left = 0;
293 strm->avail_in = gz.left;
294 strm->next_in = gz.next;
295 crc = crc32(0L, Z_NULL, 0);
296 have = full = 0;
297 do {
298 /* if needed, get more input */
299 if (strm->avail_in == 0) {
300 readmore(&gz);
301 strm->avail_in = gz.left;
302 strm->next_in = gz.next;
303 }
304
305 /* set up output to next available section of sliding window */
306 strm->avail_out = DSIZE - have;
307 strm->next_out = window + have;
308
309 /* inflate and check for errors */
310 ret = inflate(strm, Z_BLOCK);
311 if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
312 if (ret == Z_MEM_ERROR) bye("out of memory", "");
313 if (ret == Z_DATA_ERROR)
314 bye("invalid compressed data--format violated in", name);
315
316 /* update crc and sliding window pointer */
317 crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
318 if (strm->avail_out)
319 have = DSIZE - strm->avail_out;
320 else {
321 have = 0;
322 full = 1;
323 }
324
325 /* process end of block */
326 if (strm->data_type & 128) {
327 if (strm->data_type & 64)
328 left = strm->data_type & 0x1f;
329 else {
330 lastbit = strm->data_type & 0x1f;
331 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
332 }
333 }
334 } while (ret != Z_STREAM_END);
335 inflateEnd(strm);
336 gz.left = strm->avail_in;
337 gz.next = strm->next_in;
338
339 /* save the location of the end of the compressed data */
340 end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
341
342 /* check gzip trailer and save total for deflate */
343 if (crc != read4(&gz))
344 bye("invalid compressed data--crc mismatch in ", name);
345 tot = strm->total_out;
346 if ((tot & 0xffffffffUL) != read4(&gz))
347 bye("invalid compressed data--length mismatch in", name);
348
349 /* if not at end of file, warn */
350 if (gz.left || readin(&gz))
351 fprintf(stderr,
352 "gzappend warning: junk at end of gzip file overwritten\n");
353
354 /* clear last block bit */
355 lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
356 if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
357 *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
358 lseek(gz.fd, -1L, SEEK_CUR);
359 if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
360
361 /* if window wrapped, build dictionary from window by rotating */
362 if (full) {
363 rotate(window, DSIZE, have);
364 have = DSIZE;
365 }
366
367 /* set up deflate stream with window, crc, total_in, and leftover bits */
368 ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
369 if (ret != Z_OK) bye("out of memory", "");
370 deflateSetDictionary(strm, window, have);
371 strm->adler = crc;
372 strm->total_in = tot;
373 if (left) {
374 lseek(gz.fd, --end, SEEK_SET);
375 if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
376 deflatePrime(strm, 8 - left, *gz.buf);
377 }
378 lseek(gz.fd, end, SEEK_SET);
379
380 /* clean up and return */
381 free(window);
382 free(gz.buf);
383 return gz.fd;
384 }
385
386 /* append file "name" to gzip file gd using deflate stream strm -- if last
387 is true, then finish off the deflate stream at the end */
gztack(char * name,int gd,z_stream * strm,int last)388 local void gztack(char *name, int gd, z_stream *strm, int last)
389 {
390 int fd, len, ret;
391 unsigned left;
392 unsigned char *in, *out;
393
394 /* open file to compress and append */
395 fd = 0;
396 if (name != NULL) {
397 fd = open(name, O_RDONLY, 0);
398 if (fd == -1)
399 fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
400 name);
401 }
402
403 /* allocate buffers */
404 in = malloc(CHUNK);
405 out = malloc(CHUNK);
406 if (in == NULL || out == NULL) bye("out of memory", "");
407
408 /* compress input file and append to gzip file */
409 do {
410 /* get more input */
411 len = read(fd, in, CHUNK);
412 if (len == -1) {
413 fprintf(stderr,
414 "gzappend warning: error reading %s, skipping rest ...\n",
415 name);
416 len = 0;
417 }
418 strm->avail_in = (unsigned)len;
419 strm->next_in = in;
420 if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
421
422 /* compress and write all available output */
423 do {
424 strm->avail_out = CHUNK;
425 strm->next_out = out;
426 ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
427 left = CHUNK - strm->avail_out;
428 while (left) {
429 len = write(gd, out + CHUNK - strm->avail_out - left, left);
430 if (len == -1) bye("writing gzip file", "");
431 left -= (unsigned)len;
432 }
433 } while (strm->avail_out == 0 && ret != Z_STREAM_END);
434 } while (len != 0);
435
436 /* write trailer after last entry */
437 if (last) {
438 deflateEnd(strm);
439 out[0] = (unsigned char)(strm->adler);
440 out[1] = (unsigned char)(strm->adler >> 8);
441 out[2] = (unsigned char)(strm->adler >> 16);
442 out[3] = (unsigned char)(strm->adler >> 24);
443 out[4] = (unsigned char)(strm->total_in);
444 out[5] = (unsigned char)(strm->total_in >> 8);
445 out[6] = (unsigned char)(strm->total_in >> 16);
446 out[7] = (unsigned char)(strm->total_in >> 24);
447 len = 8;
448 do {
449 ret = write(gd, out + 8 - len, len);
450 if (ret == -1) bye("writing gzip file", "");
451 len -= ret;
452 } while (len);
453 close(gd);
454 }
455
456 /* clean up and return */
457 free(out);
458 free(in);
459 if (fd > 0) close(fd);
460 }
461
462 /* process the compression level option if present, scan the gzip file, and
463 append the specified files, or append the data from stdin if no other file
464 names are provided on the command line -- the gzip file must be writable
465 and seekable */
main(int argc,char ** argv)466 int main(int argc, char **argv)
467 {
468 int gd, level;
469 z_stream strm;
470
471 /* ignore command name */
472 argc--; argv++;
473
474 /* provide usage if no arguments */
475 if (*argv == NULL) {
476 printf(
477 "gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n"
478 );
479 printf(
480 "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
481 return 0;
482 }
483
484 /* set compression level */
485 level = Z_DEFAULT_COMPRESSION;
486 if (argv[0][0] == '-') {
487 if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
488 bye("invalid compression level", "");
489 level = argv[0][1] - '0';
490 if (*++argv == NULL) bye("no gzip file name after options", "");
491 }
492
493 /* prepare to append to gzip file */
494 gd = gzscan(*argv++, &strm, level);
495
496 /* append files on command line, or from stdin if none */
497 if (*argv == NULL)
498 gztack(NULL, gd, &strm, 1);
499 else
500 do {
501 gztack(*argv, gd, &strm, argv[1] == NULL);
502 } while (*++argv != NULL);
503 return 0;
504 }
505