1 /* gzappend -- command to append to a gzip file
2
3 Copyright (C) 2003 Mark Adler, all rights reserved
4 version 1.1, 4 Nov 2003
5
6 This software is provided 'as-is', without any express or implied
7 warranty. In no event will the author be held liable for any damages
8 arising from the use of this software.
9
10 Permission is granted to anyone to use this software for any purpose,
11 including commercial applications, and to alter it and redistribute it
12 freely, subject to the following restrictions:
13
14 1. The origin of this software must not be misrepresented; you must not
15 claim that you wrote the original software. If you use this software
16 in a product, an acknowledgment in the product documentation would be
17 appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and must not be
19 misrepresented as being the original software.
20 3. This notice may not be removed or altered from any source distribution.
21
22 Mark Adler madler@alumni.caltech.edu
23 */
24
25 /*
26 * Change history:
27 *
28 * 1.0 19 Oct 2003 - First version
29 * 1.1 4 Nov 2003 - Expand and clarify some comments and notes
30 * - Add version and copyright to help
31 * - Send help to stdout instead of stderr
32 * - Add some preemptive typecasts
33 * - Add L to constants in lseek() calls
34 * - Remove some debugging information in error messages
35 * - Use new data_type definition for zlib 1.2.1
36 * - Simplify and unify file operations
37 * - Finish off gzip file in gztack()
38 * - Use deflatePrime() instead of adding empty blocks
39 * - Keep gzip file clean on appended file read errors
40 * - Use in-place rotate instead of auxiliary buffer
41 * (Why you ask? Because it was fun to write!)
42 */
43
44 /*
45 gzappend takes a gzip file and appends to it, compressing files from the
46 command line or data from stdin. The gzip file is written to directly, to
47 avoid copying that file, in case it's large. Note that this results in the
48 unfriendly behavior that if gzappend fails, the gzip file is corrupted.
49
50 This program was written to illustrate the use of the new Z_BLOCK option of
51 zlib 1.2.x's inflate() function. This option returns from inflate() at each
52 block boundary to facilitate locating and modifying the last block bit at
53 the start of the final deflate block. Also whether using Z_BLOCK or not,
54 another required feature of zlib 1.2.x is that inflate() now provides the
55 number of unused bits in the last input byte used. gzappend will not work
56 with versions of zlib earlier than 1.2.1.
57
58 gzappend first decompresses the gzip file internally, discarding all but
59 the last 32K of uncompressed data, and noting the location of the last block
60 bit and the number of unused bits in the last byte of the compressed data.
61 The gzip trailer containing the CRC-32 and length of the uncompressed data
62 is verified. This trailer will be later overwritten.
63
64 Then the last block bit is cleared by seeking back in the file and rewriting
65 the byte that contains it. Seeking forward, the last byte of the compressed
66 data is saved along with the number of unused bits to initialize deflate.
67
68 A deflate process is initialized, using the last 32K of the uncompressed
69 data from the gzip file to initialize the dictionary. If the total
70 uncompressed data was less than 32K, then all of it is used to initialize
71 the dictionary. The deflate output bit buffer is also initialized with the
72 last bits from the original deflate stream. From here on, the data to
73 append is simply compressed using deflate, and written to the gzip file.
74 When that is complete, the new CRC-32 and uncompressed length are written
75 as the trailer of the gzip file.
76 */
77
78 #include <stdio.h>
79 #include <stdlib.h>
80 #include <string.h>
81 #include <fcntl.h>
82 #include <unistd.h>
83 #include "zlib.h"
84
/* file-scope functions are declared "local", i.e. with internal linkage */
#define local static

/* base-2 logarithm of the I/O buffer size, and the buffer size itself */
#define LGCHUNK 14
#define CHUNK (1U << LGCHUNK)

/* size in bytes of the sliding window kept while decompressing */
#define DSIZE 32768U
89
/* report a fatal error on stderr as "gzappend error: " followed by msg1 and
   msg2 run together and a newline, then exit with status 1 -- this function
   does not return */
local void bye(char *msg1, char *msg2)
{
    fputs("gzappend error: ", stderr);
    fputs(msg1, stderr);
    fputs(msg2, stderr);
    fputc('\n', stderr);
    exit(1);
}
96
/* greatest common divisor of a and b (the other argument if one is zero) --
   this is the subtractive form of Euclid's algorithm, except that the smaller
   value is doubled for as long as it still fits in the larger one, so that a
   large multiple is removed in one step when the arguments differ greatly */
local unsigned gcd(unsigned a, unsigned b)
{
    unsigned step;

    while (a != 0 && b != 0) {
        if (a > b) {
            /* grow b by powers of two while twice the step still fits in a */
            for (step = b; a - step >= step; step <<= 1)
                ;
            a -= step;
        }
        else {
            /* same reduction with the roles of a and b exchanged */
            for (step = a; b - step >= step; step <<= 1)
                ;
            b -= step;
        }
    }

    /* one of the two is now zero, the other is the result */
    return a + b;
}
119
/* rotate list[0..len-1] left by rot positions, in place -- rot is reduced
   modulo len, and lists shorter than two entries are left untouched */
local void rotate(unsigned char *list, unsigned len, unsigned rot)
{
    unsigned char tmp;
    unsigned cycles;
    unsigned start, to, from;

    /* normalize rot and handle degenerate cases */
    if (len < 2) return;
    if (rot >= len) rot %= len;
    if (rot == 0) return;

    /* do simple left shift by one -- source and destination overlap, so
       memmove() is required here (memcpy() on overlapping regions is
       undefined behavior) */
    if (rot == 1) {
        tmp = list[0];
        memmove(list, list + 1, len - 1);
        list[len - 1] = tmp;
        return;
    }

    /* do simple right shift by one */
    if (rot == len - 1) {
        tmp = list[len - 1];
        memmove(list + 1, list, len - 1);
        list[0] = tmp;
        return;
    }

    /* otherwise do rotate as a set of cycles in place -- indices are used
       instead of pointers so that no pointer beyond the end of the list is
       ever formed, and the wrap test is arranged so from + rot cannot
       overflow */
    cycles = gcd(len, rot);             /* number of cycles */
    do {
        start = from = cycles;          /* start index is arbitrary */
        tmp = list[from];               /* save entry to be overwritten */
        for (;;) {
            to = from;                  /* next step in cycle */
            if (from >= len - rot)      /* go right rot positions, wrapping */
                from -= len - rot;
            else
                from += rot;
            if (from == start) break;   /* all but one shifted */
            list[to] = list[from];      /* shift left */
        }
        list[to] = tmp;                 /* complete the circle */
    } while (--cycles);
}
166
/* state for buffered reading of the gzip file */
typedef struct {
    int fd;                 /* descriptor the buffer is filled from */
    int size;               /* log base 2 of the buffer size: buf is read
                               1 << size bytes at a time */
    unsigned left;          /* number of unread bytes starting at next */
    unsigned char *buf;     /* start of the buffer */
    unsigned char *next;    /* next unread byte in the buffer */
    char *name;             /* name of the file, for error messages */
} file;
176
177 /* reload buffer */
readin(file * in)178 local int readin(file *in)
179 {
180 int len;
181
182 len = read(in->fd, in->buf, 1 << in->size);
183 if (len == -1) bye("error reading ", in->name);
184 in->left = (unsigned)len;
185 in->next = in->buf;
186 return len;
187 }
188
189 /* read from file in, exit if end-of-file */
readmore(file * in)190 local int readmore(file *in)
191 {
192 if (readin(in) == 0) bye("unexpected end of ", in->name);
193 return 0;
194 }
195
/* fetch the next byte from the file pointed to by in, refilling the buffer
   with readmore() when it is empty (readmore() terminates the program at
   end-of-file) -- the argument is evaluated more than once, so it must not
   have side effects; it is parenthesized so that any pointer expression,
   such as &gz, expands correctly */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
                   (in)->left--, *((in)->next)++)
198
199 /* skip over n bytes of in */
skip(file * in,unsigned n)200 local void skip(file *in, unsigned n)
201 {
202 unsigned bypass;
203
204 if (n > in->left) {
205 n -= in->left;
206 bypass = n & ~((1U << in->size) - 1);
207 if (bypass) {
208 if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
209 bye("seeking ", in->name);
210 n -= bypass;
211 }
212 readmore(in);
213 if (n > in->left)
214 bye("unexpected end of ", in->name);
215 }
216 in->left -= n;
217 in->next += n;
218 }
219
220 /* read a four-byte unsigned integer, little-endian, from in */
read4(file * in)221 unsigned long read4(file *in)
222 {
223 unsigned long val;
224
225 val = read1(in);
226 val += (unsigned)read1(in) << 8;
227 val += (unsigned long)read1(in) << 16;
228 val += (unsigned long)read1(in) << 24;
229 return val;
230 }
231
232 /* skip over gzip header */
gzheader(file * in)233 local void gzheader(file *in)
234 {
235 int flags;
236 unsigned n;
237
238 if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
239 if (read1(in) != 8) bye("unknown compression method in", in->name);
240 flags = read1(in);
241 if (flags & 0xe0) bye("unknown header flags set in", in->name);
242 skip(in, 6);
243 if (flags & 4) {
244 n = read1(in);
245 n += (unsigned)(read1(in)) << 8;
246 skip(in, n);
247 }
248 if (flags & 8) while (read1(in) != 0) ;
249 if (flags & 16) while (read1(in) != 0) ;
250 if (flags & 2) skip(in, 2);
251 }
252
253 /* decompress gzip file "name", return strm with a deflate stream ready to
254 continue compression of the data in the gzip file, and return a file
255 descriptor pointing to where to write the compressed data -- the deflate
256 stream is initialized to compress using level "level" */
gzscan(char * name,z_stream * strm,int level)257 local int gzscan(char *name, z_stream *strm, int level)
258 {
259 int ret, lastbit, left, full;
260 unsigned have;
261 unsigned long crc, tot;
262 unsigned char *window;
263 off_t lastoff, end;
264 file gz;
265
266 /* open gzip file */
267 gz.name = name;
268 gz.fd = open(name, O_RDWR, 0);
269 if (gz.fd == -1) bye("cannot open ", name);
270 gz.buf = malloc(CHUNK);
271 if (gz.buf == NULL) bye("out of memory", "");
272 gz.size = LGCHUNK;
273 gz.left = 0;
274
275 /* skip gzip header */
276 gzheader(&gz);
277
278 /* prepare to decompress */
279 window = malloc(DSIZE);
280 if (window == NULL) bye("out of memory", "");
281 strm->zalloc = Z_NULL;
282 strm->zfree = Z_NULL;
283 strm->opaque = Z_NULL;
284 ret = inflateInit2(strm, -15);
285 if (ret != Z_OK) bye("out of memory", " or library mismatch");
286
287 /* decompress the deflate stream, saving append information */
288 lastbit = 0;
289 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
290 left = 0;
291 strm->avail_in = gz.left;
292 strm->next_in = gz.next;
293 crc = crc32(0L, Z_NULL, 0);
294 have = full = 0;
295 do {
296 /* if needed, get more input */
297 if (strm->avail_in == 0) {
298 readmore(&gz);
299 strm->avail_in = gz.left;
300 strm->next_in = gz.next;
301 }
302
303 /* set up output to next available section of sliding window */
304 strm->avail_out = DSIZE - have;
305 strm->next_out = window + have;
306
307 /* inflate and check for errors */
308 ret = inflate(strm, Z_BLOCK);
309 if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
310 if (ret == Z_MEM_ERROR) bye("out of memory", "");
311 if (ret == Z_DATA_ERROR)
312 bye("invalid compressed data--format violated in", name);
313
314 /* update crc and sliding window pointer */
315 crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
316 if (strm->avail_out)
317 have = DSIZE - strm->avail_out;
318 else {
319 have = 0;
320 full = 1;
321 }
322
323 /* process end of block */
324 if (strm->data_type & 128) {
325 if (strm->data_type & 64)
326 left = strm->data_type & 0x1f;
327 else {
328 lastbit = strm->data_type & 0x1f;
329 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
330 }
331 }
332 } while (ret != Z_STREAM_END);
333 inflateEnd(strm);
334 gz.left = strm->avail_in;
335 gz.next = strm->next_in;
336
337 /* save the location of the end of the compressed data */
338 end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
339
340 /* check gzip trailer and save total for deflate */
341 if (crc != read4(&gz))
342 bye("invalid compressed data--crc mismatch in ", name);
343 tot = strm->total_out;
344 if ((tot & 0xffffffffUL) != read4(&gz))
345 bye("invalid compressed data--length mismatch in", name);
346
347 /* if not at end of file, warn */
348 if (gz.left || readin(&gz))
349 fprintf(stderr,
350 "gzappend warning: junk at end of gzip file overwritten\n");
351
352 /* clear last block bit */
353 lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
354 if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
355 *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
356 lseek(gz.fd, -1L, SEEK_CUR);
357 if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
358
359 /* if window wrapped, build dictionary from window by rotating */
360 if (full) {
361 rotate(window, DSIZE, have);
362 have = DSIZE;
363 }
364
365 /* set up deflate stream with window, crc, total_in, and leftover bits */
366 ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
367 if (ret != Z_OK) bye("out of memory", "");
368 deflateSetDictionary(strm, window, have);
369 strm->adler = crc;
370 strm->total_in = tot;
371 if (left) {
372 lseek(gz.fd, --end, SEEK_SET);
373 if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
374 deflatePrime(strm, 8 - left, *gz.buf);
375 }
376 lseek(gz.fd, end, SEEK_SET);
377
378 /* clean up and return */
379 free(window);
380 free(gz.buf);
381 return gz.fd;
382 }
383
384 /* append file "name" to gzip file gd using deflate stream strm -- if last
385 is true, then finish off the deflate stream at the end */
gztack(char * name,int gd,z_stream * strm,int last)386 local void gztack(char *name, int gd, z_stream *strm, int last)
387 {
388 int fd, len, ret;
389 unsigned left;
390 unsigned char *in, *out;
391
392 /* open file to compress and append */
393 fd = 0;
394 if (name != NULL) {
395 fd = open(name, O_RDONLY, 0);
396 if (fd == -1)
397 fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
398 name);
399 }
400
401 /* allocate buffers */
402 in = fd == -1 ? NULL : malloc(CHUNK);
403 out = malloc(CHUNK);
404 if (out == NULL) bye("out of memory", "");
405
406 /* compress input file and append to gzip file */
407 do {
408 /* get more input */
409 len = fd == -1 ? 0 : read(fd, in, CHUNK);
410 if (len == -1) {
411 fprintf(stderr,
412 "gzappend warning: error reading %s, skipping rest ...\n",
413 name);
414 len = 0;
415 }
416 strm->avail_in = (unsigned)len;
417 strm->next_in = in;
418 if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
419
420 /* compress and write all available output */
421 do {
422 strm->avail_out = CHUNK;
423 strm->next_out = out;
424 ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
425 left = CHUNK - strm->avail_out;
426 while (left) {
427 len = write(gd, out + CHUNK - strm->avail_out - left, left);
428 if (len == -1) bye("writing gzip file", "");
429 left -= (unsigned)len;
430 }
431 } while (strm->avail_out == 0 && ret != Z_STREAM_END);
432 } while (len != 0);
433
434 /* write trailer after last entry */
435 if (last) {
436 deflateEnd(strm);
437 out[0] = (unsigned char)(strm->adler);
438 out[1] = (unsigned char)(strm->adler >> 8);
439 out[2] = (unsigned char)(strm->adler >> 16);
440 out[3] = (unsigned char)(strm->adler >> 24);
441 out[4] = (unsigned char)(strm->total_in);
442 out[5] = (unsigned char)(strm->total_in >> 8);
443 out[6] = (unsigned char)(strm->total_in >> 16);
444 out[7] = (unsigned char)(strm->total_in >> 24);
445 len = 8;
446 do {
447 ret = write(gd, out + 8 - len, len);
448 if (ret == -1) bye("writing gzip file", "");
449 len -= ret;
450 } while (len);
451 close(gd);
452 }
453
454 /* clean up and return */
455 free(out);
456 if (in != NULL) free(in);
457 if (fd > 0) close(fd);
458 }
459
460 /* process the compression level option if present, scan the gzip file, and
461 append the specified files, or append the data from stdin if no other file
462 names are provided on the command line -- the gzip file must be writable
463 and seekable */
main(int argc,char ** argv)464 int main(int argc, char **argv)
465 {
466 int gd, level;
467 z_stream strm;
468
469 /* ignore command name */
470 argv++;
471
472 /* provide usage if no arguments */
473 if (*argv == NULL) {
474 printf("gzappend 1.1 (4 Nov 2003) Copyright (C) 2003 Mark Adler\n");
475 printf(
476 "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
477 return 0;
478 }
479
480 /* set compression level */
481 level = Z_DEFAULT_COMPRESSION;
482 if (argv[0][0] == '-') {
483 if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
484 bye("invalid compression level", "");
485 level = argv[0][1] - '0';
486 if (*++argv == NULL) bye("no gzip file name after options", "");
487 }
488
489 /* prepare to append to gzip file */
490 gd = gzscan(*argv++, &strm, level);
491
492 /* append files on command line, or from stdin if none */
493 if (*argv == NULL)
494 gztack(NULL, gd, &strm, 1);
495 else
496 do {
497 gztack(*argv, gd, &strm, argv[1] == NULL);
498 } while (*++argv != NULL);
499 return 0;
500 }
501