• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* deflate.c - deflate/inflate code for gzip and friends
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
6  * LSB 4.1 has gzip, gunzip, and zcat
7  *
8  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
9  */
10 
11 #include "toys.h"
12 
13 struct deflate {
14   // Huffman codes: base offset and extra bits tables (length and distance)
15   char lenbits[29], distbits[30];
16   unsigned short lenbase[29], distbase[30];
17   void *fixdisthuff, *fixlithuff;
18 
19   // CRC
20   void (*crcfunc)(struct deflate *dd, char *data, unsigned len);
21   unsigned crctable[256], crc;
22 
23 
24   // Tables only used for deflation
25   unsigned short *hashhead, *hashchain;
26 
27   // Compressed data buffer (extra space malloced at end)
28   unsigned pos, len;
29   int infd, outfd;
30   char data[];
31 };
32 
33 // little endian bit buffer
34 struct bitbuf {
35   int fd, bitpos, len, max;
36   char buf[];
37 };
38 
39 // malloc a struct bitbuf
bitbuf_init(int fd,int size)40 static struct bitbuf *bitbuf_init(int fd, int size)
41 {
42   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
43 
44   bb->max = size;
45   bb->fd = fd;
46 
47   return bb;
48 }
49 
50 // Advance bitpos without the overhead of recording bits
51 // Loads more data when input buffer empty
52 // call with 0 to just load data, returns 0 at EOF
bitbuf_skip(struct bitbuf * bb,int bits)53 static int bitbuf_skip(struct bitbuf *bb, int bits)
54 {
55   int pos = bb->bitpos + bits + (bits<0), len;
56 
57   while (pos >= (len = bb->len<<3)) {
58     pos -= len;
59     if (1 > (bb->len = read(bb->fd, bb->buf, bb->max))) {
60       if (!bb->len && !bits) break;
61       error_exit("inflate EOF");
62     }
63   }
64   bb->bitpos = pos;
65 
66   return pos<len;
67 }
68 
69 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)70 static inline int bitbuf_bit(struct bitbuf *bb)
71 {
72   int bufpos = bb->bitpos>>3;
73 
74   if (bufpos == bb->len) {
75     bitbuf_skip(bb, -1);
76     bufpos = 0;
77   }
78 
79   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
80 }
81 
82 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)83 static unsigned bitbuf_get(struct bitbuf *bb, int bits)
84 {
85   int result = 0, offset = 0;
86 
87   while (bits) {
88     int click = bb->bitpos >> 3, blow, blen;
89 
90     // Load more data if buffer empty
91     if (click == bb->len) {
92       bitbuf_skip(bb, -1);
93       click = 0;
94     }
95 
96     // grab bits from next byte
97     blow = bb->bitpos & 7;
98     blen = 8-blow;
99     if (blen > bits) blen = bits;
100     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
101     offset += blen;
102     bits -= blen;
103     bb->bitpos += blen;
104   }
105 
106   return result;
107 }
108 
bitbuf_flush(struct bitbuf * bb)109 static void bitbuf_flush(struct bitbuf *bb)
110 {
111   if (!bb->bitpos) return;
112 
113   xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
114   memset(bb->buf, 0, bb->max);
115   bb->bitpos = 0;
116 }
117 
bitbuf_put(struct bitbuf * bb,int data,int len)118 static void bitbuf_put(struct bitbuf *bb, int data, int len)
119 {
120   while (len) {
121     int click = bb->bitpos >> 3, blow, blen;
122 
123     // Flush buffer if necessary
124     if (click == bb->max) {
125       bitbuf_flush(bb);
126       click = 0;
127     }
128     blow = bb->bitpos & 7;
129     blen = 8-blow;
130     if (blen > len) blen = len;
131     bb->buf[click] |= data << blow;
132     bb->bitpos += blen;
133     data >>= blen;
134     len -= blen;
135   }
136 }
137 
output_byte(struct deflate * dd,char sym)138 static void output_byte(struct deflate *dd, char sym)
139 {
140   int pos = dd->pos++ & 32767;
141 
142   dd->data[pos] = sym;
143 
144   if (pos == 32767) {
145     xwrite(dd->outfd, dd->data, 32768);
146     if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768);
147   }
148 }
149 
150 // Huffman coding uses bits to traverse a binary tree to a leaf node,
151 // By placing frequently occurring symbols at shorter paths, frequently
152 // used symbols may be represented in fewer bits than uncommon symbols.
153 // (length[0] isn't used but code's clearer if it's there.)
154 
155 struct huff {
156   unsigned short length[16];  // How many symbols have this bit length?
157   unsigned short symbol[288]; // sorted by bit length, then ascending order
158 };
159 
160 // Create simple huffman tree from array of bit lengths.
161 
162 // The symbols in the huffman trees are sorted (first by bit length
163 // of the code to reach them, then by symbol number). This means that given
164 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)165 static void len2huff(struct huff *huff, char bitlen[], int len)
166 {
167   int offset[16];
168   int i;
169 
170   // Count number of codes at each bit length
171   memset(huff, 0, sizeof(struct huff));
172   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
173 
174   // Sort symbols by bit length, then symbol. Get list of starting positions
175   // for each group, then write each symbol to next position within its group.
176   *huff->length = *offset = 0;
177   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
178   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
179 }
180 
181 // Fetch and decode next huffman coded symbol from bitbuf.
182 // This takes advantage of the sorting to navigate the tree as an array:
183 // each time we fetch a bit we have all the codes at that bit level in
184 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)185 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
186 {
187   unsigned short *length = huff->length;
188   int start = 0, offset = 0;
189 
190   // Traverse through the bit lengths until our code is in this range
191   for (;;) {
192     offset = (offset << 1) | bitbuf_bit(bb);
193     start += *++length;
194     if ((offset -= *length) < 0) break;
195     if ((length - huff->length) & 16) error_exit("bad symbol");
196   }
197 
198   return huff->symbol[start + offset];
199 }
200 
201 // Decompress deflated data from bitbuf to dd->outfd.
inflate(struct deflate * dd,struct bitbuf * bb)202 static void inflate(struct deflate *dd, struct bitbuf *bb)
203 {
204   dd->crc = ~0;
205 
206   // repeat until spanked
207   for (;;) {
208     int final, type;
209 
210     final = bitbuf_get(bb, 1);
211     type = bitbuf_get(bb, 2);
212 
213     if (type == 3) error_exit("bad type");
214 
215     // Uncompressed block?
216     if (!type) {
217       int len, nlen;
218 
219       // Align to byte, read length
220       bitbuf_skip(bb, (8-bb->bitpos)&7);
221       len = bitbuf_get(bb, 16);
222       nlen = bitbuf_get(bb, 16);
223       if (len != (0xffff & ~nlen)) error_exit("bad len");
224 
225       // Dump literal output data
226       while (len) {
227         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
228         char *p = bb->buf+pos;
229 
230         // dump bytes until done or end of current bitbuf contents
231         if (bblen > len) bblen = len;
232         pos = bblen;
233         while (pos--) output_byte(dd, *(p++));
234         bitbuf_skip(bb, bblen << 3);
235         len -= bblen;
236       }
237 
238     // Compressed block
239     } else {
240       struct huff *disthuff, *lithuff;
241 
242       // Dynamic huffman codes?
243       if (type == 2) {
244         struct huff *h2 = ((struct huff *)libbuf)+1;
245         int i, litlen, distlen, hufflen;
246         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
247                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
248 
249         // The huffman trees are stored as a series of bit lengths
250         litlen = bitbuf_get(bb, 5)+257;  // max 288
251         distlen = bitbuf_get(bb, 5)+1;   // max 32
252         hufflen = bitbuf_get(bb, 4)+4;   // max 19
253 
254         // The literal and distance codes are themselves compressed, in
255         // a complicated way: an array of bit lengths (hufflen many
256         // entries, each 3 bits) is used to fill out an array of 19 entries
257         // in a magic order, leaving the rest 0. Then make a tree out of it:
258         memset(bits = libbuf+1, 0, 19);
259         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
260         len2huff(h2, bits, 19);
261 
262         // Use that tree to read in the literal and distance bit lengths
263         for (i = 0; i < litlen + distlen;) {
264           int sym = huff_and_puff(bb, h2);
265 
266           // 0-15 are literals, 16 = repeat previous code 3-6 times,
267           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
268           if (sym < 16) bits[i++] = sym;
269           else {
270             int len = sym & 2;
271 
272             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
273             memset(bits+i, bits[i-1] * !(sym&3), len);
274             i += len;
275           }
276         }
277         if (i > litlen+distlen) error_exit("bad tree");
278 
279         len2huff(lithuff = h2, bits, litlen);
280         len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen);
281 
282       // Static huffman codes
283       } else {
284         lithuff = dd->fixlithuff;
285         disthuff = dd->fixdisthuff;
286       }
287 
288       // Use huffman tables to decode block of compressed symbols
289       for (;;) {
290         int sym = huff_and_puff(bb, lithuff);
291 
292         // Literal?
293         if (sym < 256) output_byte(dd, sym);
294 
295         // Copy range?
296         else if (sym > 256) {
297           int len, dist;
298 
299           sym -= 257;
300           len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
301           sym = huff_and_puff(bb, disthuff);
302           dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
303           sym = dd->pos & 32767;
304 
305           while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
306 
307         // End of block
308         } else break;
309       }
310     }
311 
312     // Was that the last block?
313     if (final) break;
314   }
315 
316   if (dd->pos & 32767) {
317     xwrite(dd->outfd, dd->data, dd->pos&32767);
318     if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767);
319   }
320 }
321 
322 // Deflate from dd->infd to bitbuf
323 // For deflate, dd->len = input read, dd->pos = input consumed
deflate(struct deflate * dd,struct bitbuf * bb)324 static void deflate(struct deflate *dd, struct bitbuf *bb)
325 {
326   char *data = dd->data;
327   int len, final = 0;
328 
329   dd->crc = ~0;
330 
331   while (!final) {
332     // Read next half-window of data if we haven't hit EOF yet.
333     len = readall(dd->infd, data+(dd->len&32768), 32768);
334     if (len < 0) perror_exit("read"); // TODO: add filename
335     if (len != 32768) final++;
336     if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
337     // dd->len += len;  crcfunc advances len TODO
338 
339     // store block as literal
340     bitbuf_put(bb, final, 1);
341     bitbuf_put(bb, 0, 1);
342 
343     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
344     bitbuf_put(bb, len, 16);
345     bitbuf_put(bb, 0xffff & ~len, 16);
346 
347     // repeat until spanked
348     while (dd->pos != dd->len) {
349       unsigned pos = dd->pos&65535;
350 
351       bitbuf_put(bb, data[pos], 8);
352 
353       // need to refill buffer?
354       if (!(32767 & ++dd->pos) && !final) break;
355     }
356   }
357   bitbuf_flush(bb);
358 }
359 
360 // Allocate memory for deflate/inflate.
init_deflate(int compress)361 static struct deflate *init_deflate(int compress)
362 {
363   int i, n = 1;
364   struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
365 
366   memset(dd, 0, sizeof(struct deflate));
367   // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
368   if (compress) {
369     dd->hashhead = (unsigned short *)(dd->data+65536);
370     dd->hashchain = (unsigned short *)(dd->data+65536+32768);
371   }
372 
373   // Calculate lenbits, lenbase, distbits, distbase
374   *dd->lenbase = 3;
375   for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
376     if (i>4) {
377       if (!(i&3)) {
378         dd->lenbits[i]++;
379         n <<= 1;
380       }
381       if (i == 27) n--;
382       else dd->lenbits[i+1] = dd->lenbits[i];
383     }
384     dd->lenbase[i+1] = n + dd->lenbase[i];
385   }
386   n = 0;
387   for (i = 0; i<sizeof(dd->distbits); i++) {
388     dd->distbase[i] = 1<<n;
389     if (i) dd->distbase[i] += dd->distbase[i-1];
390     if (i>3 && !(i&1)) n++;
391     dd->distbits[i] = n;
392   }
393 
394 // TODO layout and lifetime of this?
395   // Init fixed huffman tables
396   for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
397   len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288);
398   memset(libbuf, 5, 30);
399   len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30);
400 
401   return dd;
402 }
403 
404 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)405 static int is_gzip(struct bitbuf *bb)
406 {
407   int flags;
408 
409   // Confirm signature
410   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
411     return 0;
412   bitbuf_skip(bb, 6*8);
413 
414   // Skip extra, name, comment, header CRC fields
415   if (flags & 4) bitbuf_skip(bb, bitbuf_get(bb, 16) * 8);
416   if (flags & 8) while (bitbuf_get(bb, 8));
417   if (flags & 16) while (bitbuf_get(bb, 8));
418   if (flags & 2) bitbuf_skip(bb, 16);
419 
420   return 1;
421 }
422 
gzip_crc(struct deflate * dd,char * data,unsigned len)423 static void gzip_crc(struct deflate *dd, char *data, unsigned len)
424 {
425   int i;
426   unsigned crc, *crc_table = dd->crctable;
427 
428   crc = dd->crc;
429   for (i = 0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
430   dd->crc = crc;
431   dd->len += len;
432 }
433 
434 /*
435 // Start with crc = 1, or pass in last crc to append more data
436 // Deferred modulus good for paged size inputs (can't overflow for ~5500 bytes)
437 unsigned adler32(char *buf, unsigned len, unsigned crc)
438 {
439   unsigned aa = crc&((1<<16)-1), bb = crc>>16;
440 
441   while (len--) {
442     aa += *buf++;
443     bb += aa;
444   }
445 
446   return ((bb%65521)<<16)+aa%65521;
447 }
448 */
449 
gzip_fd(int infd,int outfd)450 long long gzip_fd(int infd, int outfd)
451 {
452   struct bitbuf *bb = bitbuf_init(outfd, 4096);
453   struct deflate *dd = init_deflate(1);
454   long long rc;
455 
456   // Header from RFC 1952 section 2.2:
457   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
458   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
459   // Operating System (FF=unknown)
460 
461   dd->infd = infd;
462   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
463 
464   // Little endian crc table
465   crc_init(dd->crctable, 1);
466   dd->crcfunc = gzip_crc;
467 
468   deflate(dd, bb);
469 
470   // tail: crc32, len32
471 
472   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
473   bitbuf_put(bb, ~dd->crc, 32);
474   bitbuf_put(bb, dd->len, 32);
475   rc = dd->len;
476 
477   bitbuf_flush(bb);
478   free(bb);
479   free(dd);
480 
481   return rc;
482 }
483 
gunzip_fd(int infd,int outfd)484 long long gunzip_fd(int infd, int outfd)
485 {
486   struct bitbuf *bb = bitbuf_init(infd, 4096);
487   struct deflate *dd = init_deflate(0);
488   long long rc = 0;
489 
490   // Little endian crc table
491   crc_init(dd->crctable, 1);
492   dd->crcfunc = gzip_crc;
493   dd->outfd = outfd;
494 
495   do {
496     if (!is_gzip(bb)) error_exit("not gzip");
497 
498     inflate(dd, bb);
499 
500     // tail: crc32, len32
501     bitbuf_skip(bb, (8-bb->bitpos)&7);
502     if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
503       error_exit("bad crc");
504     rc += dd->len;
505 
506     bitbuf_skip(bb, (8-bb->bitpos)&7);
507     dd->pos = dd->len = 0;
508   } while (bitbuf_skip(bb, 0));
509   free(bb);
510   free(dd);
511 
512   return rc;
513 }
514