• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* gznorm.c -- normalize a gzip stream
2  * Copyright (C) 2018 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  * Version 1.0  7 Oct 2018  Mark Adler */
5 
6 // gznorm takes a gzip stream, potentially containing multiple members, and
7 // converts it to a gzip stream with a single member. In addition the gzip
8 // header is normalized, removing the file name and time stamp, and setting the
9 // other header contents (XFL, OS) to fixed values. gznorm does not recompress
10 // the data, so it is fast, but no advantage is gained from the history that
11 // could be available across member boundaries.
12 
13 #include <stdio.h>      // fread, fwrite, putc, fflush, ferror, fprintf,
14                         // vsnprintf, stdout, stderr, NULL, FILE
15 #include <stdlib.h>     // malloc, free
16 #include <string.h>     // strerror
17 #include <errno.h>      // errno
18 #include <stdarg.h>     // va_list, va_start, va_end
19 #include "zlib.h"       // inflateInit2, inflate, inflateReset, inflateEnd,
20                         // z_stream, z_off_t, crc32_combine, Z_NULL, Z_BLOCK,
21                         // Z_OK, Z_STREAM_END, Z_BUF_ERROR, Z_DATA_ERROR,
22                         // Z_MEM_ERROR
23 
// SET_BINARY_MODE(file) puts a stdio stream into binary mode, using setmode(),
// when building for MSDOS, OS2, WIN32, or Cygwin, so that no end-of-line
// conversion is applied to the gzip data. On every other build it expands to
// nothing.
#if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
#  include <fcntl.h>
#  include <io.h>
#  define SET_BINARY_MODE(file) setmode(fileno(file), O_BINARY)
#else
#  define SET_BINARY_MODE(file)
#endif
31 
// Functions declared "local" have internal linkage (private to this file).
#define local static
33 
// printf to an allocated string. fmt and the arguments that follow it are
// interpreted exactly as by printf(). Return the formatted string, which the
// caller must free(), or NULL if the formatting or the allocation fails.
local char *aprintf(const char *fmt, ...) {
    // Get the length of the result of the printf.
    va_list args;
    va_start(args, fmt);
    int len = vsnprintf(NULL, 0, fmt, args);
    va_end(args);
    if (len < 0)
        return NULL;

    // Allocate the required space, including the terminating nul. The size is
    // computed as a size_t so that len + 1 cannot overflow an int.
    size_t size = (size_t)len + 1;
    char *str = malloc(size);
    if (str == NULL)
        return NULL;

    // printf to the allocated space. The argument list has to be restarted,
    // since the first vsnprintf() consumed it.
    va_start(args, fmt);
    int ret = vsnprintf(str, size, fmt, args);
    va_end(args);
    if (ret < 0 || (size_t)ret >= size) {
        // The second pass failed or would have been truncated, so the buffer
        // contents cannot be trusted.
        free(str);
        return NULL;
    }
    return str;
}
54 
// Return with an error, putting an allocated error message in *err. Doing an
// inflateEnd() on an already ended state, or one with state set to Z_NULL, is
// permitted.
//
// BYE() takes printf-style arguments, which are handed to aprintf(). It can
// only be used inside a function that returns an int and that has a z_stream
// named strm and a char ** named err in scope: it ends strm, stores the
// message in *err, and makes the enclosing function return 1. *err is NULL if
// aprintf() could not produce the message.
#define BYE(...) \
    do { \
        inflateEnd(&strm); \
        *err = aprintf(__VA_ARGS__); \
        return 1; \
    } while (0)
64 
// Chunk size for buffered reads and for decompression. Twice this many bytes
// will be allocated on the stack by gzip_normalize(): one buffer for the
// compressed input, and one scratch buffer for the discarded inflate output.
// Must fit in an unsigned, since it is assigned to strm.avail_out and used in
// unsigned arithmetic.
#define CHUNK 16384
68 
69 // Read a gzip stream from in and write an equivalent normalized gzip stream to
70 // out. If given no input, an empty gzip stream will be written. If successful,
71 // 0 is returned, and *err is set to NULL. On error, 1 is returned, where the
72 // details of the error are returned in *err, a pointer to an allocated string.
73 //
74 // The input may be a stream with multiple gzip members, which is converted to
75 // a single gzip member on the output. Each gzip member is decompressed at the
76 // level of deflate blocks. This enables clearing the last-block bit, shifting
77 // the compressed data to concatenate to the previous member's compressed data,
78 // which can end at an arbitrary bit boundary, and identifying stored blocks in
79 // order to resynchronize those to byte boundaries. The deflate compressed data
80 // is terminated with a 10-bit empty fixed block. If any members on the input
81 // end with a 10-bit empty fixed block, then that block is excised from the
82 // stream. This avoids appending empty fixed blocks for every normalization,
83 // and assures that gzip_normalize applied a second time will not change the
84 // input. The pad bits after stored block headers and after the final deflate
85 // block are all forced to zeros.
gzip_normalize(FILE * in,FILE * out,char ** err)86 local int gzip_normalize(FILE *in, FILE *out, char **err) {
87     // initialize the inflate engine to process a gzip member
88     z_stream strm;
89     strm.zalloc = Z_NULL;
90     strm.zfree = Z_NULL;
91     strm.opaque = Z_NULL;
92     strm.avail_in = 0;
93     strm.next_in = Z_NULL;
94     if (inflateInit2(&strm, 15 + 16) != Z_OK)
95         BYE("out of memory");
96 
97     // State while processing the input gzip stream.
98     enum {              // BETWEEN -> HEAD -> BLOCK -> TAIL -> BETWEEN -> ...
99         BETWEEN,        // between gzip members (must end in this state)
100         HEAD,           // reading a gzip header
101         BLOCK,          // reading deflate blocks
102         TAIL            // reading a gzip trailer
103     } state = BETWEEN;              // current component being processed
104     unsigned long crc = 0;          // accumulated CRC of uncompressed data
105     unsigned long len = 0;          // accumulated length of uncompressed data
106     unsigned long buf = 0;          // deflate stream bit buffer of num bits
107     int num = 0;                    // number of bits in buf (at bottom)
108 
109     // Write a canonical gzip header (no mod time, file name, comment, extra
110     // block, or extra flags, and OS is marked as unknown).
111     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
112 
113     // Process the gzip stream from in until reaching the end of the input,
114     // encountering invalid input, or experiencing an i/o error.
115     int more;                       // true if not at the end of the input
116     do {
117         // State inside this loop.
118         unsigned char *put;         // next input buffer location to process
119         int prev;                   // number of bits from previous block in
120                                     // the bit buffer, or -1 if not at the
121                                     // start of a block
122         unsigned long long memb;    // uncompressed length of member
123         size_t tail;                // number of trailer bytes read (0..8)
124         unsigned long part;         // accumulated trailer component
125 
126         // Get the next chunk of input from in.
127         unsigned char dat[CHUNK];
128         strm.avail_in = fread(dat, 1, CHUNK, in);
129         if (strm.avail_in == 0)
130             break;
131         more = strm.avail_in == CHUNK;
132         strm.next_in = put = dat;
133 
134         // Run that chunk of input through the inflate engine to exhaustion.
135         do {
136             // At this point it is assured that strm.avail_in > 0.
137 
138             // Inflate until the end of a gzip component (header, deflate
139             // block, trailer) is reached, or until all of the chunk is
140             // consumed. The resulting decompressed data is discarded, though
141             // the total size of the decompressed data in each member is
142             // tracked, for the calculation of the total CRC.
143             do {
144                 // inflate and handle any errors
145                 unsigned char scrap[CHUNK];
146                 strm.avail_out = CHUNK;
147                 strm.next_out = scrap;
148                 int ret = inflate(&strm, Z_BLOCK);
149                 if (ret == Z_MEM_ERROR)
150                     BYE("out of memory");
151                 if (ret == Z_DATA_ERROR)
152                     BYE("input invalid: %s", strm.msg);
153                 if (ret != Z_OK && ret != Z_BUF_ERROR && ret != Z_STREAM_END)
154                     BYE("internal error");
155 
156                 // Update the number of uncompressed bytes generated in this
157                 // member. The actual count (not modulo 2^32) is required to
158                 // correctly compute the total CRC.
159                 unsigned got = CHUNK - strm.avail_out;
160                 memb += got;
161                 if (memb < got)
162                     BYE("overflow error");
163 
164                 // Continue to process this chunk until it is consumed, or
165                 // until the end of a component (header, deflate block, or
166                 // trailer) is reached.
167             } while (strm.avail_out == 0 && (strm.data_type & 0x80) == 0);
168 
169             // Since strm.avail_in was > 0 for the inflate call, some input was
170             // just consumed. It is therefore assured that put < strm.next_in.
171 
172             // Disposition the consumed component or part of a component.
173             switch (state) {
174                 case BETWEEN:
175                     state = HEAD;
176                     // Fall through to HEAD when some or all of the header is
177                     // processed.
178 
179                 case HEAD:
180                     // Discard the header.
181                     if (strm.data_type & 0x80) {
182                         // End of header reached -- deflate blocks follow.
183                         put = strm.next_in;
184                         prev = num;
185                         memb = 0;
186                         state = BLOCK;
187                     }
188                     break;
189 
190                 case BLOCK:
191                     // Copy the deflate stream to the output, but with the
192                     // last-block-bit cleared. Re-synchronize stored block
193                     // headers to the output byte boundaries. The bytes at
194                     // put..strm.next_in-1 is the compressed data that has been
195                     // processed and is ready to be copied to the output.
196 
197                     // At this point, it is assured that new compressed data is
198                     // available, i.e., put < strm.next_in. If prev is -1, then
199                     // that compressed data starts in the middle of a deflate
200                     // block. If prev is not -1, then the bits in the bit
201                     // buffer, possibly combined with the bits in *put, contain
202                     // the three-bit header of the new deflate block. In that
203                     // case, prev is the number of bits from the previous block
204                     // that remain in the bit buffer. Since num is the number
205                     // of bits in the bit buffer, we have that num - prev is
206                     // the number of bits from the new block currently in the
207                     // bit buffer.
208 
209                     // If strm.data_type & 0xc0 is 0x80, then the last byte of
210                     // the available compressed data includes the last bits of
211                     // the end of a deflate block. In that case, that last byte
212                     // also has strm.data_type & 0x1f bits of the next deflate
213                     // block, in the range 0..7. If strm.data_type & 0xc0 is
214                     // 0xc0, then the last byte of the compressed data is the
215                     // end of the deflate stream, followed by strm.data_type &
216                     // 0x1f pad bits, also in the range 0..7.
217 
218                     // Set bits to the number of bits not yet consumed from the
219                     // last byte. If we are at the end of the block, bits is
220                     // either the number of bits in the last byte belonging to
221                     // the next block, or the number of pad bits after the
222                     // final block. In either of those cases, bits is in the
223                     // range 0..7.
224                     ;                   // (required due to C syntax oddity)
225                     int bits = strm.data_type & 0x1f;
226 
227                     if (prev != -1) {
228                         // We are at the start of a new block. Clear the last
229                         // block bit, and check for special cases. If it is a
230                         // stored block, then emit the header and pad to the
231                         // next byte boundary. If it is a final, empty fixed
232                         // block, then excise it.
233 
234                         // Some or all of the three header bits for this block
235                         // may already be in the bit buffer. Load any remaining
236                         // header bits into the bit buffer.
237                         if (num - prev < 3) {
238                             buf += (unsigned long)*put++ << num;
239                             num += 8;
240                         }
241 
242                         // Set last to have a 1 in the position of the last
243                         // block bit in the bit buffer.
244                         unsigned long last = (unsigned long)1 << prev;
245 
246                         if (((buf >> prev) & 7) == 3) {
247                             // This is a final fixed block. Load at least ten
248                             // bits from this block, including the header, into
249                             // the bit buffer. We already have at least three,
250                             // so at most one more byte needs to be loaded.
251                             if (num - prev < 10) {
252                                 if (put == strm.next_in)
253                                     // Need to go get and process more input.
254                                     // We'll end up back here to finish this.
255                                     break;
256                                 buf += (unsigned long)*put++ << num;
257                                 num += 8;
258                             }
259                             if (((buf >> prev) & 0x3ff) == 3) {
260                                 // That final fixed block is empty. Delete it
261                                 // to avoid adding an empty block every time a
262                                 // gzip stream is normalized.
263                                 num = prev;
264                                 buf &= last - 1;    // zero the pad bits
265                             }
266                         }
267                         else if (((buf >> prev) & 6) == 0) {
268                             // This is a stored block. Flush to the next
269                             // byte boundary after the three-bit header.
270                             num = (prev + 10) & ~7;
271                             buf &= last - 1;        // zero the pad bits
272                         }
273 
274                         // Clear the last block bit.
275                         buf &= ~last;
276 
277                         // Write out complete bytes in the bit buffer.
278                         while (num >= 8) {
279                             putc(buf, out);
280                             buf >>= 8;
281                             num -= 8;
282                         }
283 
284                         // If no more bytes left to process, then we have
285                         // consumed the byte that had bits from the next block.
286                         if (put == strm.next_in)
287                             bits = 0;
288                     }
289 
290                     // We are done handling the deflate block header. Now copy
291                     // all or almost all of the remaining compressed data that
292                     // has been processed so far. Don't copy one byte at the
293                     // end if it contains bits from the next deflate block or
294                     // pad bits at the end of a deflate block.
295 
296                     // mix is 1 if we are at the end of a deflate block, and if
297                     // some of the bits in the last byte follow this block. mix
298                     // is 0 if we are in the middle of a deflate block, if the
299                     // deflate block ended on a byte boundary, or if all of the
300                     // compressed data processed so far has been consumed.
301                     int mix = (strm.data_type & 0x80) && bits;
302 
303                     // Copy all of the processed compressed data to the output,
304                     // except for the last byte if it contains bits from the
305                     // next deflate block or pad bits at the end of the deflate
306                     // stream. Copy the data after shifting in num bits from
307                     // buf in front of it, leaving num bits from the end of the
308                     // compressed data in buf when done.
309                     unsigned char *end = strm.next_in - mix;
310                     if (put < end) {
311                         if (num)
312                             // Insert num bits from buf before the data being
313                             // copied.
314                             do {
315                                 buf += (unsigned)(*put++) << num;
316                                 putc(buf, out);
317                                 buf >>= 8;
318                             } while (put < end);
319                         else {
320                             // No shifting needed -- write directly.
321                             fwrite(put, 1, end - put, out);
322                             put = end;
323                         }
324                     }
325 
326                     // Process the last processed byte if it wasn't written.
327                     if (mix) {
328                         // Load the last byte into the bit buffer.
329                         buf += (unsigned)(*put++) << num;
330                         num += 8;
331 
332                         if (strm.data_type & 0x40) {
333                             // We are at the end of the deflate stream and
334                             // there are bits pad bits. Discard the pad bits
335                             // and write a byte to the output, if available.
336                             // Leave the num bits left over in buf to prepend
337                             // to the next deflate stream.
338                             num -= bits;
339                             if (num >= 8) {
340                                 putc(buf, out);
341                                 num -= 8;
342                                 buf >>= 8;
343                             }
344 
345                             // Force the pad bits in the bit buffer to zeros.
346                             buf &= ((unsigned long)1 << num) - 1;
347 
348                             // Don't need to set prev here since going to TAIL.
349                         }
350                         else
351                             // At the end of an internal deflate block. Leave
352                             // the last byte in the bit buffer to examine on
353                             // the next entry to BLOCK, when more bits from the
354                             // next block will be available.
355                             prev = num - bits;      // number of bits in buffer
356                                                     // from current block
357                     }
358 
359                     // Don't have a byte left over, so we are in the middle of
360                     // a deflate block, or the deflate block ended on a byte
361                     // boundary. Set prev appropriately for the next entry into
362                     // BLOCK.
363                     else if (strm.data_type & 0x80)
364                         // The block ended on a byte boundary, so no header
365                         // bits are in the bit buffer.
366                         prev = num;
367                     else
368                         // In the middle of a deflate block, so no header here.
369                         prev = -1;
370 
371                     // Check for the end of the deflate stream.
372                     if ((strm.data_type & 0xc0) == 0xc0) {
373                         // That ends the deflate stream on the input side, the
374                         // pad bits were discarded, and any remaining bits from
375                         // the last block in the stream are saved in the bit
376                         // buffer to prepend to the next stream. Process the
377                         // gzip trailer next.
378                         tail = 0;
379                         part = 0;
380                         state = TAIL;
381                     }
382                     break;
383 
384                 case TAIL:
385                     // Accumulate available trailer bytes to update the total
386                     // CRC and the total uncompressed length.
387                     do {
388                         part = (part >> 8) + ((unsigned long)(*put++) << 24);
389                         tail++;
390                         if (tail == 4) {
391                             // Update the total CRC.
392                             z_off_t len2 = memb;
393                             if (len2 < 0 || (unsigned long long)len2 != memb)
394                                 BYE("overflow error");
395                             crc = crc ? crc32_combine(crc, part, len2) : part;
396                             part = 0;
397                         }
398                         else if (tail == 8) {
399                             // Update the total uncompressed length. (It's ok
400                             // if this sum is done modulo 2^32.)
401                             len += part;
402 
403                             // At the end of a member. Set up to inflate an
404                             // immediately following gzip member. (If we made
405                             // it this far, then the trailer was valid.)
406                             if (inflateReset(&strm) != Z_OK)
407                                 BYE("internal error");
408                             state = BETWEEN;
409                             break;
410                         }
411                     } while (put < strm.next_in);
412                     break;
413             }
414 
415             // Process the input buffer until completely consumed.
416         } while (strm.avail_in > 0);
417 
418         // Process input until end of file, invalid input, or i/o error.
419     } while (more);
420 
421     // Done with the inflate engine.
422     inflateEnd(&strm);
423 
424     // Verify the validity of the input.
425     if (state != BETWEEN)
426         BYE("input invalid: incomplete gzip stream");
427 
428     // Write the remaining deflate stream bits, followed by a terminating
429     // deflate fixed block.
430     buf += (unsigned long)3 << num;
431     putc(buf, out);
432     putc(buf >> 8, out);
433     if (num > 6)
434         putc(0, out);
435 
436     // Write the gzip trailer, which is the CRC and the uncompressed length
437     // modulo 2^32, both in little-endian order.
438     putc(crc, out);
439     putc(crc >> 8, out);
440     putc(crc >> 16, out);
441     putc(crc >> 24, out);
442     putc(len, out);
443     putc(len >> 8, out);
444     putc(len >> 16, out);
445     putc(len >> 24, out);
446     fflush(out);
447 
448     // Check for any i/o errors.
449     if (ferror(in) || ferror(out))
450         BYE("i/o error: %s", strerror(errno));
451 
452     // All good!
453     *err = NULL;
454     return 0;
455 }
456 
// Normalize the gzip stream on stdin, writing the result to stdout. The exit
// status is 0 on success, or 1 on error, in which case a message describing
// the error is written to stderr.
int main(void) {
    // Avoid end-of-line conversions on evil operating systems.
    SET_BINARY_MODE(stdin);
    SET_BINARY_MODE(stdout);

    // Normalize from stdin to stdout, returning 1 on error, 0 if ok.
    char *err = NULL;
    int ret = gzip_normalize(stdin, stdout, &err);
    if (ret)
        // err is NULL if the error message could not be allocated, and a null
        // pointer must not be passed for %s.
        fprintf(stderr, "gznorm error: %s\n",
                err != NULL ? err : "(no details available)");
    free(err);
    return ret;
}
471