1 /* gznorm.c -- normalize a gzip stream
2 * Copyright (C) 2018 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 * Version 1.0 7 Oct 2018 Mark Adler */
5
6 // gznorm takes a gzip stream, potentially containing multiple members, and
7 // converts it to a gzip stream with a single member. In addition the gzip
8 // header is normalized, removing the file name and time stamp, and setting the
9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress
10 // the data, so it is fast, but no advantage is gained from the history that
11 // could be available across member boundaries.
12
13 #include <stdio.h> // fread, fwrite, putc, fflush, ferror, fprintf,
14 // vsnprintf, stdout, stderr, NULL, FILE
15 #include <stdlib.h> // malloc, free
16 #include <string.h> // strerror
17 #include <errno.h> // errno
18 #include <stdarg.h> // va_list, va_start, va_end
19 #include "zlib.h" // inflateInit2, inflate, inflateReset, inflateEnd,
20 // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
21 // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
22 // Z_MEM_ERROR
23
24 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
25 # include <fcntl.h>
26 # include <io.h>
27 # define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
28 #else
29 # define SET_BINARY_MODE(file)
30 #endif
31
32 #define local static
33
34 // printf to an allocated string. Return the string, or NULL if the printf or
35 // allocation fails.
aprintf(char * fmt,...)36 local char *aprintf(char *fmt, ...) {
37 // Get the length of the result of the printf.
38 va_list args;
39 va_start(args, fmt);
40 int len = vsnprintf(NULL, 0, fmt, args);
41 va_end(args);
42 if (len < 0)
43 return NULL;
44
45 // Allocate the required space and printf to it.
46 char *str = malloc(len + 1);
47 if (str == NULL)
48 return NULL;
49 va_start(args, fmt);
50 vsnprintf(str, len + 1, fmt, args);
51 va_end(args);
52 return str;
53 }
54
55 // Return with an error, putting an allocated error message in *err. Doing an
56 // inflateEnd() on an already ended state, or one with state set to Z_NULL, is
57 // permitted.
58 #define BYE(...) \
59 do { \
60 inflateEnd(&strm); \
61 *err = aprintf(__VA_ARGS__); \
62 return 1; \
63 } while (0)
64
65 // Chunk size for buffered reads and for decompression. Twice this many bytes
66 // will be allocated on the stack by gzip_normalize(). Must fit in an unsigned.
67 #define CHUNK 16384
68
69 // Read a gzip stream from in and write an equivalent normalized gzip stream to
70 // out. If given no input, an empty gzip stream will be written. If successful,
71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
72 // details of the error are returned in *err, a pointer to an allocated string.
73 //
74 // The input may be a stream with multiple gzip members, which is converted to
75 // a single gzip member on the output. Each gzip member is decompressed at the
76 // level of deflate blocks. This enables clearing the last-block bit, shifting
77 // the compressed data to concatenate to the previous member's compressed data,
78 // which can end at an arbitrary bit boundary, and identifying stored blocks in
79 // order to resynchronize those to byte boundaries. The deflate compressed data
80 // is terminated with a 10-bit empty fixed block. If any members on the input
81 // end with a 10-bit empty fixed block, then that block is excised from the
82 // stream. This avoids appending empty fixed blocks for every normalization,
83 // and assures that gzip_normalize applied a second time will not change the
84 // input. The pad bits after stored block headers and after the final deflate
85 // block are all forced to zeros.
gzip_normalize(FILE * in,FILE * out,char ** err)86 local int gzip_normalize(FILE *in, FILE *out, char **err) {
87 // initialize the inflate engine to process a gzip member
88 z_stream strm;
89 strm.zalloc = Z_NULL;
90 strm.zfree = Z_NULL;
91 strm.opaque = Z_NULL;
92 strm.avail_in = 0;
93 strm.next_in = Z_NULL;
94 if (inflateInit2(&strm, 15 + 16) != Z_OK)
95 BYE("out of memory");
96
97 // State while processing the input gzip stream.
98 enum { // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
99 BETWEEN, // between gzip members (must end in this state)
100 HEAD, // reading a gzip header
101 BLOCK, // reading deflate blocks
102 TAIL // reading a gzip trailer
103 } state = BETWEEN; // current component being processed
104 unsigned long crc = 0; // accumulated CRC of uncompressed data
105 unsigned long len = 0; // accumulated length of uncompressed data
106 unsigned long buf = 0; // deflate stream bit buffer of num bits
107 int num = 0; // number of bits in buf (at bottom)
108
109 // Write a canonical gzip header (no mod time, file name, comment, extra
110 // block, or extra flags, and OS is marked as unknown).
111 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
112
113 // Process the gzip stream from in until reaching the end of the input,
114 // encountering invalid input, or experiencing an i/o error.
115 int more; // true if not at the end of the input
116 do {
117 // State inside this loop.
118 unsigned char *put; // next input buffer location to process
119 int prev; // number of bits from previous block in
120 // the bit buffer, or -1 if not at the
121 // start of a block
122 unsigned long long memb; // uncompressed length of member
123 size_t tail; // number of trailer bytes read (0..8)
124 unsigned long part; // accumulated trailer component
125
126 // Get the next chunk of input from in.
127 unsigned char dat[CHUNK];
128 strm.avail_in = fread(dat, 1, CHUNK, in);
129 if (strm.avail_in == 0)
130 break;
131 more = strm.avail_in == CHUNK;
132 strm.next_in = put = dat;
133
134 // Run that chunk of input through the inflate engine to exhaustion.
135 do {
136 // At this point it is assured that strm.avail_in > 0.
137
138 // Inflate until the end of a gzip component (header, deflate
139 // block, trailer) is reached, or until all of the chunk is
140 // consumed. The resulting decompressed data is discarded, though
141 // the total size of the decompressed data in each member is
142 // tracked, for the calculation of the total CRC.
143 do {
144 // inflate and handle any errors
145 unsigned char scrap[CHUNK];
146 strm.avail_out = CHUNK;
147 strm.next_out = scrap;
148 int ret = inflate(&strm, Z_BLOCK);
149 if (ret == Z_MEM_ERROR)
150 BYE("out of memory");
151 if (ret == Z_DATA_ERROR)
152 BYE("input invalid: %s", strm.msg);
153 if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
154 BYE("internal error");
155
156 // Update the number of uncompressed bytes generated in this
157 // member. The actual count (not modulo 2^32) is required to
158 // correctly compute the total CRC.
159 unsigned got = CHUNK - strm.avail_out;
160 memb += got;
161 if (memb < got)
162 BYE("overflow error");
163
164 // Continue to process this chunk until it is consumed, or
165 // until the end of a component (header, deflate block, or
166 // trailer) is reached.
167 } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
168
169 // Since strm.avail_in was > 0 for the inflate call, some input was
170 // just consumed. It is therefore assured that put < strm.next_in.
171
172 // Disposition the consumed component or part of a component.
173 switch (state) {
174 case BETWEEN:
175 state = HEAD;
176 // Fall through to HEAD when some or all of the header is
177 // processed.
178
179 case HEAD:
180 // Discard the header.
181 if (strm.data_type & 0x80) {
182 // End of header reached -- deflate blocks follow.
183 put = strm.next_in;
184 prev = num;
185 memb = 0;
186 state = BLOCK;
187 }
188 break;
189
190 case BLOCK:
191 // Copy the deflate stream to the output, but with the
192 // last-block-bit cleared. Re-synchronize stored block
193 // headers to the output byte boundaries. The bytes at
194 // put..strm.next_in-1 is the compressed data that has been
195 // processed and is ready to be copied to the output.
196
197 // At this point, it is assured that new compressed data is
198 // available, i.e., put < strm.next_in. If prev is -1, then
199 // that compressed data starts in the middle of a deflate
200 // block. If prev is not -1, then the bits in the bit
201 // buffer, possibly combined with the bits in *put, contain
202 // the three-bit header of the new deflate block. In that
203 // case, prev is the number of bits from the previous block
204 // that remain in the bit buffer. Since num is the number
205 // of bits in the bit buffer, we have that num - prev is
206 // the number of bits from the new block currently in the
207 // bit buffer.
208
209 // If strm.data_type & 0xc0 is 0x80, then the last byte of
210 // the available compressed data includes the last bits of
211 // the end of a deflate block. In that case, that last byte
212 // also has strm.data_type & 0x1f bits of the next deflate
213 // block, in the range 0..7. If strm.data_type & 0xc0 is
214 // 0xc0, then the last byte of the compressed data is the
215 // end of the deflate stream, followed by strm.data_type &
216 // 0x1f pad bits, also in the range 0..7.
217
218 // Set bits to the number of bits not yet consumed from the
219 // last byte. If we are at the end of the block, bits is
220 // either the number of bits in the last byte belonging to
221 // the next block, or the number of pad bits after the
222 // final block. In either of those cases, bits is in the
223 // range 0..7.
224 ; // (required due to C syntax oddity)
225 int bits = strm.data_type & 0x1f;
226
227 if (prev != -1) {
228 // We are at the start of a new block. Clear the last
229 // block bit, and check for special cases. If it is a
230 // stored block, then emit the header and pad to the
231 // next byte boundary. If it is a final, empty fixed
232 // block, then excise it.
233
234 // Some or all of the three header bits for this block
235 // may already be in the bit buffer. Load any remaining
236 // header bits into the bit buffer.
237 if (num - prev < 3) {
238 buf += (unsigned long)*put++ << num;
239 num += 8;
240 }
241
242 // Set last to have a 1 in the position of the last
243 // block bit in the bit buffer.
244 unsigned long last = (unsigned long)1 << prev;
245
246 if (((buf >> prev) & 7) == 3) {
247 // This is a final fixed block. Load at least ten
248 // bits from this block, including the header, into
249 // the bit buffer. We already have at least three,
250 // so at most one more byte needs to be loaded.
251 if (num - prev < 10) {
252 if (put == strm.next_in)
253 // Need to go get and process more input.
254 // We'll end up back here to finish this.
255 break;
256 buf += (unsigned long)*put++ << num;
257 num += 8;
258 }
259 if (((buf >> prev) & 0x3ff) == 3) {
260 // That final fixed block is empty. Delete it
261 // to avoid adding an empty block every time a
262 // gzip stream is normalized.
263 num = prev;
264 buf &= last - 1; // zero the pad bits
265 }
266 }
267 else if (((buf >> prev) & 6) == 0) {
268 // This is a stored block. Flush to the next
269 // byte boundary after the three-bit header.
270 num = (prev + 10) & ~7;
271 buf &= last - 1; // zero the pad bits
272 }
273
274 // Clear the last block bit.
275 buf &= ~last;
276
277 // Write out complete bytes in the bit buffer.
278 while (num >= 8) {
279 putc(buf, out);
280 buf >>= 8;
281 num -= 8;
282 }
283
284 // If no more bytes left to process, then we have
285 // consumed the byte that had bits from the next block.
286 if (put == strm.next_in)
287 bits = 0;
288 }
289
290 // We are done handling the deflate block header. Now copy
291 // all or almost all of the remaining compressed data that
292 // has been processed so far. Don't copy one byte at the
293 // end if it contains bits from the next deflate block or
294 // pad bits at the end of a deflate block.
295
296 // mix is 1 if we are at the end of a deflate block, and if
297 // some of the bits in the last byte follow this block. mix
298 // is 0 if we are in the middle of a deflate block, if the
299 // deflate block ended on a byte boundary, or if all of the
300 // compressed data processed so far has been consumed.
301 int mix = (strm.data_type & 0x80) && bits;
302
303 // Copy all of the processed compressed data to the output,
304 // except for the last byte if it contains bits from the
305 // next deflate block or pad bits at the end of the deflate
306 // stream. Copy the data after shifting in num bits from
307 // buf in front of it, leaving num bits from the end of the
308 // compressed data in buf when done.
309 unsigned char *end = strm.next_in - mix;
310 if (put < end) {
311 if (num)
312 // Insert num bits from buf before the data being
313 // copied.
314 do {
315 buf += (unsigned)(*put++) << num;
316 putc(buf, out);
317 buf >>= 8;
318 } while (put < end);
319 else {
320 // No shifting needed -- write directly.
321 fwrite(put, 1, end - put, out);
322 put = end;
323 }
324 }
325
326 // Process the last processed byte if it wasn't written.
327 if (mix) {
328 // Load the last byte into the bit buffer.
329 buf += (unsigned)(*put++) << num;
330 num += 8;
331
332 if (strm.data_type & 0x40) {
333 // We are at the end of the deflate stream and
334 // there are bits pad bits. Discard the pad bits
335 // and write a byte to the output, if available.
336 // Leave the num bits left over in buf to prepend
337 // to the next deflate stream.
338 num -= bits;
339 if (num >= 8) {
340 putc(buf, out);
341 num -= 8;
342 buf >>= 8;
343 }
344
345 // Force the pad bits in the bit buffer to zeros.
346 buf &= ((unsigned long)1 << num) - 1;
347
348 // Don't need to set prev here since going to TAIL.
349 }
350 else
351 // At the end of an internal deflate block. Leave
352 // the last byte in the bit buffer to examine on
353 // the next entry to BLOCK, when more bits from the
354 // next block will be available.
355 prev = num - bits; // number of bits in buffer
356 // from current block
357 }
358
359 // Don't have a byte left over, so we are in the middle of
360 // a deflate block, or the deflate block ended on a byte
361 // boundary. Set prev appropriately for the next entry into
362 // BLOCK.
363 else if (strm.data_type & 0x80)
364 // The block ended on a byte boundary, so no header
365 // bits are in the bit buffer.
366 prev = num;
367 else
368 // In the middle of a deflate block, so no header here.
369 prev = -1;
370
371 // Check for the end of the deflate stream.
372 if ((strm.data_type & 0xc0) == 0xc0) {
373 // That ends the deflate stream on the input side, the
374 // pad bits were discarded, and any remaining bits from
375 // the last block in the stream are saved in the bit
376 // buffer to prepend to the next stream. Process the
377 // gzip trailer next.
378 tail = 0;
379 part = 0;
380 state = TAIL;
381 }
382 break;
383
384 case TAIL:
385 // Accumulate available trailer bytes to update the total
386 // CRC and the total uncompressed length.
387 do {
388 part = (part >> 8) + ((unsigned long)(*put++) << 24);
389 tail++;
390 if (tail == 4) {
391 // Update the total CRC.
392 z_off_t len2 = memb;
393 if (len2 < 0 || (unsigned long long)len2 != memb)
394 BYE("overflow error");
395 crc = crc ? crc32_combine(crc, part, len2) : part;
396 part = 0;
397 }
398 else if (tail == 8) {
399 // Update the total uncompressed length. (It's ok
400 // if this sum is done modulo 2^32.)
401 len += part;
402
403 // At the end of a member. Set up to inflate an
404 // immediately following gzip member. (If we made
405 // it this far, then the trailer was valid.)
406 if (inflateReset(&strm) != Z_OK)
407 BYE("internal error");
408 state = BETWEEN;
409 break;
410 }
411 } while (put < strm.next_in);
412 break;
413 }
414
415 // Process the input buffer until completely consumed.
416 } while (strm.avail_in > 0);
417
418 // Process input until end of file, invalid input, or i/o error.
419 } while (more);
420
421 // Done with the inflate engine.
422 inflateEnd(&strm);
423
424 // Verify the validity of the input.
425 if (state != BETWEEN)
426 BYE("input invalid: incomplete gzip stream");
427
428 // Write the remaining deflate stream bits, followed by a terminating
429 // deflate fixed block.
430 buf += (unsigned long)3 << num;
431 putc(buf, out);
432 putc(buf >> 8, out);
433 if (num > 6)
434 putc(0, out);
435
436 // Write the gzip trailer, which is the CRC and the uncompressed length
437 // modulo 2^32, both in little-endian order.
438 putc(crc, out);
439 putc(crc >> 8, out);
440 putc(crc >> 16, out);
441 putc(crc >> 24, out);
442 putc(len, out);
443 putc(len >> 8, out);
444 putc(len >> 16, out);
445 putc(len >> 24, out);
446 fflush(out);
447
448 // Check for any i/o errors.
449 if (ferror(in) || ferror(out))
450 BYE("i/o error: %s", strerror(errno));
451
452 // All good!
453 *err = NULL;
454 return 0;
455 }
456
457 // Normalize the gzip stream on stdin, writing the result to stdout.
main(void)458 int main(void) {
459 // Avoid end-of-line conversions on evil operating systems.
460 SET_BINARY_MODE(stdin);
461 SET_BINARY_MODE(stdout);
462
463 // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
464 char *err;
465 int ret = gzip_normalize(stdin, stdout, &err);
466 if (ret)
467 fprintf(stderr, "gznorm error: %s\n", err);
468 free(err);
469 return ret;
470 }
471