1 /* deflate.c - deflate/inflate code for gzip and friends
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
6 * LSB 4.1 has gzip, gunzip, and zcat
7 *
8 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
9 */
10
11 #include "toys.h"
12
// State shared by the compressor and decompressor.
struct deflate {
  // Huffman codes: extra bit counts and base values, indexed by length
  // symbol (29 entries) or distance symbol (30 entries).
  char lenbits[29];
  char distbits[30];
  unsigned short lenbase[29];
  unsigned short distbase[30];

  // Fixed huffman tables for distance and literal/length codes
  void *fixdisthuff;
  void *fixlithuff;

  // CRC: optional callback run over chunks of data, its lookup table, and
  // the running value.
  void (*crcfunc)(struct deflate *dd, char *data, unsigned len);
  unsigned crctable[256];
  unsigned crc;

  // Tables only used for deflation
  unsigned short *hashhead;
  unsigned short *hashchain;

  // Data buffer (extra space malloced at end), with byte counters and the
  // file descriptors to read from and write to.
  unsigned pos;
  unsigned len;
  int infd;
  int outfd;
  char data[];
};
32
// Little endian bit buffer: file descriptor, current position in bits,
// number of valid bytes in buf (when reading), and size of buf in bytes.
struct bitbuf {
  int fd;
  int bitpos;
  int len;
  int max;
  char buf[];
};
38
39 // malloc a struct bitbuf
bitbuf_init(int fd,int size)40 static struct bitbuf *bitbuf_init(int fd, int size)
41 {
42 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
43
44 bb->max = size;
45 bb->fd = fd;
46
47 return bb;
48 }
49
50 // Advance bitpos without the overhead of recording bits
51 // Loads more data when input buffer empty
52 // call with 0 to just load data, returns 0 at EOF
bitbuf_skip(struct bitbuf * bb,int bits)53 static int bitbuf_skip(struct bitbuf *bb, int bits)
54 {
55 int pos = bb->bitpos + bits + (bits<0), len;
56
57 while (pos >= (len = bb->len<<3)) {
58 pos -= len;
59 if (1 > (bb->len = read(bb->fd, bb->buf, bb->max))) {
60 if (!bb->len && !bits) break;
61 error_exit("inflate EOF");
62 }
63 }
64 bb->bitpos = pos;
65
66 return pos<len;
67 }
68
69 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)70 static inline int bitbuf_bit(struct bitbuf *bb)
71 {
72 int bufpos = bb->bitpos>>3;
73
74 if (bufpos == bb->len) {
75 bitbuf_skip(bb, -1);
76 bufpos = 0;
77 }
78
79 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
80 }
81
82 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)83 static unsigned bitbuf_get(struct bitbuf *bb, int bits)
84 {
85 int result = 0, offset = 0;
86
87 while (bits) {
88 int click = bb->bitpos >> 3, blow, blen;
89
90 // Load more data if buffer empty
91 if (click == bb->len) {
92 bitbuf_skip(bb, -1);
93 click = 0;
94 }
95
96 // grab bits from next byte
97 blow = bb->bitpos & 7;
98 blen = 8-blow;
99 if (blen > bits) blen = bits;
100 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
101 offset += blen;
102 bits -= blen;
103 bb->bitpos += blen;
104 }
105
106 return result;
107 }
108
bitbuf_flush(struct bitbuf * bb)109 static void bitbuf_flush(struct bitbuf *bb)
110 {
111 if (!bb->bitpos) return;
112
113 xwrite(bb->fd, bb->buf, (bb->bitpos+7)>>3);
114 memset(bb->buf, 0, bb->max);
115 bb->bitpos = 0;
116 }
117
bitbuf_put(struct bitbuf * bb,int data,int len)118 static void bitbuf_put(struct bitbuf *bb, int data, int len)
119 {
120 while (len) {
121 int click = bb->bitpos >> 3, blow, blen;
122
123 // Flush buffer if necessary
124 if (click == bb->max) {
125 bitbuf_flush(bb);
126 click = 0;
127 }
128 blow = bb->bitpos & 7;
129 blen = 8-blow;
130 if (blen > len) blen = len;
131 bb->buf[click] |= data << blow;
132 bb->bitpos += blen;
133 data >>= blen;
134 len -= blen;
135 }
136 }
137
output_byte(struct deflate * dd,char sym)138 static void output_byte(struct deflate *dd, char sym)
139 {
140 int pos = dd->pos++ & 32767;
141
142 dd->data[pos] = sym;
143
144 if (pos == 32767) {
145 xwrite(dd->outfd, dd->data, 32768);
146 if (dd->crcfunc) dd->crcfunc(dd, dd->data, 32768);
147 }
148 }
149
// Huffman coding reads bits to walk a binary tree down to a leaf node.
// Putting frequently occurring symbols on shorter paths lets them be
// represented in fewer bits than uncommon ones.
// (length[0] isn't used, but the code is clearer with it there.)

struct huff {
  unsigned short length[16];  // Number of symbols with each code bit length
  unsigned short symbol[288]; // Symbols sorted by bit length, then by value
};
159
160 // Create simple huffman tree from array of bit lengths.
161
162 // The symbols in the huffman trees are sorted (first by bit length
163 // of the code to reach them, then by symbol number). This means that given
164 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)165 static void len2huff(struct huff *huff, char bitlen[], int len)
166 {
167 int offset[16];
168 int i;
169
170 // Count number of codes at each bit length
171 memset(huff, 0, sizeof(struct huff));
172 for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
173
174 // Sort symbols by bit length, then symbol. Get list of starting positions
175 // for each group, then write each symbol to next position within its group.
176 *huff->length = *offset = 0;
177 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
178 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
179 }
180
181 // Fetch and decode next huffman coded symbol from bitbuf.
182 // This takes advantage of the sorting to navigate the tree as an array:
183 // each time we fetch a bit we have all the codes at that bit level in
184 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)185 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
186 {
187 unsigned short *length = huff->length;
188 int start = 0, offset = 0;
189
190 // Traverse through the bit lengths until our code is in this range
191 for (;;) {
192 offset = (offset << 1) | bitbuf_bit(bb);
193 start += *++length;
194 if ((offset -= *length) < 0) break;
195 if ((length - huff->length) & 16) error_exit("bad symbol");
196 }
197
198 return huff->symbol[start + offset];
199 }
200
201 // Decompress deflated data from bitbuf to dd->outfd.
inflate(struct deflate * dd,struct bitbuf * bb)202 static void inflate(struct deflate *dd, struct bitbuf *bb)
203 {
204 dd->crc = ~0;
205
206 // repeat until spanked
207 for (;;) {
208 int final, type;
209
210 final = bitbuf_get(bb, 1);
211 type = bitbuf_get(bb, 2);
212
213 if (type == 3) error_exit("bad type");
214
215 // Uncompressed block?
216 if (!type) {
217 int len, nlen;
218
219 // Align to byte, read length
220 bitbuf_skip(bb, (8-bb->bitpos)&7);
221 len = bitbuf_get(bb, 16);
222 nlen = bitbuf_get(bb, 16);
223 if (len != (0xffff & ~nlen)) error_exit("bad len");
224
225 // Dump literal output data
226 while (len) {
227 int pos = bb->bitpos >> 3, bblen = bb->len - pos;
228 char *p = bb->buf+pos;
229
230 // dump bytes until done or end of current bitbuf contents
231 if (bblen > len) bblen = len;
232 pos = bblen;
233 while (pos--) output_byte(dd, *(p++));
234 bitbuf_skip(bb, bblen << 3);
235 len -= bblen;
236 }
237
238 // Compressed block
239 } else {
240 struct huff *disthuff, *lithuff;
241
242 // Dynamic huffman codes?
243 if (type == 2) {
244 struct huff *h2 = ((struct huff *)libbuf)+1;
245 int i, litlen, distlen, hufflen;
246 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
247 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
248
249 // The huffman trees are stored as a series of bit lengths
250 litlen = bitbuf_get(bb, 5)+257; // max 288
251 distlen = bitbuf_get(bb, 5)+1; // max 32
252 hufflen = bitbuf_get(bb, 4)+4; // max 19
253
254 // The literal and distance codes are themselves compressed, in
255 // a complicated way: an array of bit lengths (hufflen many
256 // entries, each 3 bits) is used to fill out an array of 19 entries
257 // in a magic order, leaving the rest 0. Then make a tree out of it:
258 memset(bits = libbuf+1, 0, 19);
259 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
260 len2huff(h2, bits, 19);
261
262 // Use that tree to read in the literal and distance bit lengths
263 for (i = 0; i < litlen + distlen;) {
264 int sym = huff_and_puff(bb, h2);
265
266 // 0-15 are literals, 16 = repeat previous code 3-6 times,
267 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
268 if (sym < 16) bits[i++] = sym;
269 else {
270 int len = sym & 2;
271
272 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
273 memset(bits+i, bits[i-1] * !(sym&3), len);
274 i += len;
275 }
276 }
277 if (i > litlen+distlen) error_exit("bad tree");
278
279 len2huff(lithuff = h2, bits, litlen);
280 len2huff(disthuff = ((struct huff *)libbuf)+2, bits+litlen, distlen);
281
282 // Static huffman codes
283 } else {
284 lithuff = dd->fixlithuff;
285 disthuff = dd->fixdisthuff;
286 }
287
288 // Use huffman tables to decode block of compressed symbols
289 for (;;) {
290 int sym = huff_and_puff(bb, lithuff);
291
292 // Literal?
293 if (sym < 256) output_byte(dd, sym);
294
295 // Copy range?
296 else if (sym > 256) {
297 int len, dist;
298
299 sym -= 257;
300 len = dd->lenbase[sym] + bitbuf_get(bb, dd->lenbits[sym]);
301 sym = huff_and_puff(bb, disthuff);
302 dist = dd->distbase[sym] + bitbuf_get(bb, dd->distbits[sym]);
303 sym = dd->pos & 32767;
304
305 while (len--) output_byte(dd, dd->data[(dd->pos-dist) & 32767]);
306
307 // End of block
308 } else break;
309 }
310 }
311
312 // Was that the last block?
313 if (final) break;
314 }
315
316 if (dd->pos & 32767) {
317 xwrite(dd->outfd, dd->data, dd->pos&32767);
318 if (dd->crcfunc) dd->crcfunc(dd, dd->data, dd->pos&32767);
319 }
320 }
321
322 // Deflate from dd->infd to bitbuf
323 // For deflate, dd->len = input read, dd->pos = input consumed
deflate(struct deflate * dd,struct bitbuf * bb)324 static void deflate(struct deflate *dd, struct bitbuf *bb)
325 {
326 char *data = dd->data;
327 int len, final = 0;
328
329 dd->crc = ~0;
330
331 while (!final) {
332 // Read next half-window of data if we haven't hit EOF yet.
333 len = readall(dd->infd, data+(dd->len&32768), 32768);
334 if (len < 0) perror_exit("read"); // TODO: add filename
335 if (len != 32768) final++;
336 if (dd->crcfunc) dd->crcfunc(dd, data+(dd->len&32768), len);
337 // dd->len += len; crcfunc advances len TODO
338
339 // store block as literal
340 bitbuf_put(bb, final, 1);
341 bitbuf_put(bb, 0, 1);
342
343 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
344 bitbuf_put(bb, len, 16);
345 bitbuf_put(bb, 0xffff & ~len, 16);
346
347 // repeat until spanked
348 while (dd->pos != dd->len) {
349 unsigned pos = dd->pos&65535;
350
351 bitbuf_put(bb, data[pos], 8);
352
353 // need to refill buffer?
354 if (!(32767 & ++dd->pos) && !final) break;
355 }
356 }
357 bitbuf_flush(bb);
358 }
359
360 // Allocate memory for deflate/inflate.
init_deflate(int compress)361 static struct deflate *init_deflate(int compress)
362 {
363 int i, n = 1;
364 struct deflate *dd = xmalloc(sizeof(struct deflate)+32768*(compress ? 4 : 1));
365
366 memset(dd, 0, sizeof(struct deflate));
367 // decompress needs 32k history, compress adds 64k hashhead and 32k hashchain
368 if (compress) {
369 dd->hashhead = (unsigned short *)(dd->data+65536);
370 dd->hashchain = (unsigned short *)(dd->data+65536+32768);
371 }
372
373 // Calculate lenbits, lenbase, distbits, distbase
374 *dd->lenbase = 3;
375 for (i = 0; i<sizeof(dd->lenbits)-1; i++) {
376 if (i>4) {
377 if (!(i&3)) {
378 dd->lenbits[i]++;
379 n <<= 1;
380 }
381 if (i == 27) n--;
382 else dd->lenbits[i+1] = dd->lenbits[i];
383 }
384 dd->lenbase[i+1] = n + dd->lenbase[i];
385 }
386 n = 0;
387 for (i = 0; i<sizeof(dd->distbits); i++) {
388 dd->distbase[i] = 1<<n;
389 if (i) dd->distbase[i] += dd->distbase[i-1];
390 if (i>3 && !(i&1)) n++;
391 dd->distbits[i] = n;
392 }
393
394 // TODO layout and lifetime of this?
395 // Init fixed huffman tables
396 for (i=0; i<288; i++) libbuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
397 len2huff(dd->fixlithuff = ((struct huff *)libbuf)+3, libbuf, 288);
398 memset(libbuf, 5, 30);
399 len2huff(dd->fixdisthuff = ((struct huff *)libbuf)+4, libbuf, 30);
400
401 return dd;
402 }
403
// Try to consume a gzip header from the bitbuf.
// Returns 1 after reading past a complete header, 0 if the signature or
// flags byte didn't check out (the bytes examined stay consumed).
static int is_gzip(struct bitbuf *bb)
{
  int flags, bit;

  // Signature: bytes 1f 8b 08, read little endian as one 24 bit value
  if (bitbuf_get(bb, 24) != 0x088b1f) return 0;

  // Only the low 5 flag bits are accepted
  flags = bitbuf_get(bb, 8);
  if (flags > 31) return 0;

  // The remaining 6 bytes of fixed header aren't used
  bitbuf_skip(bb, 48);

  // Extra field: 16 bit byte count, then that many bytes
  if (flags & 4) {
    int extra = bitbuf_get(bb, 16);

    bitbuf_skip(bb, extra * 8);
  }

  // Name (8) then comment (16): each a run of bytes ending with a zero
  for (bit = 8; bit <= 16; bit <<= 1)
    if (flags & bit) while (bitbuf_get(bb, 8));

  // Header CRC: 2 bytes
  if (flags & 2) bitbuf_skip(bb, 16);

  return 1;
}
422
gzip_crc(struct deflate * dd,char * data,unsigned len)423 static void gzip_crc(struct deflate *dd, char *data, unsigned len)
424 {
425 int i;
426 unsigned crc, *crc_table = dd->crctable;
427
428 crc = dd->crc;
429 for (i = 0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
430 dd->crc = crc;
431 dd->len += len;
432 }
433
434 /*
435 // Start with crc = 1, or pass in last crc to append more data
436 // Deferred modulus good for paged size inputs (can't overflow for ~5500 bytes)
437 unsigned adler32(char *buf, unsigned len, unsigned crc)
438 {
439 unsigned aa = crc&((1<<16)-1), bb = crc>>16;
440
441 while (len--) {
442 aa += *buf++;
443 bb += aa;
444 }
445
446 return ((bb%65521)<<16)+aa%65521;
447 }
448 */
449
gzip_fd(int infd,int outfd)450 long long gzip_fd(int infd, int outfd)
451 {
452 struct bitbuf *bb = bitbuf_init(outfd, 4096);
453 struct deflate *dd = init_deflate(1);
454 long long rc;
455
456 // Header from RFC 1952 section 2.2:
457 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
458 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
459 // Operating System (FF=unknown)
460
461 dd->infd = infd;
462 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
463
464 // Little endian crc table
465 crc_init(dd->crctable, 1);
466 dd->crcfunc = gzip_crc;
467
468 deflate(dd, bb);
469
470 // tail: crc32, len32
471
472 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
473 bitbuf_put(bb, ~dd->crc, 32);
474 bitbuf_put(bb, dd->len, 32);
475 rc = dd->len;
476
477 bitbuf_flush(bb);
478 free(bb);
479 free(dd);
480
481 return rc;
482 }
483
gunzip_fd(int infd,int outfd)484 long long gunzip_fd(int infd, int outfd)
485 {
486 struct bitbuf *bb = bitbuf_init(infd, 4096);
487 struct deflate *dd = init_deflate(0);
488 long long rc = 0;
489
490 // Little endian crc table
491 crc_init(dd->crctable, 1);
492 dd->crcfunc = gzip_crc;
493 dd->outfd = outfd;
494
495 do {
496 if (!is_gzip(bb)) error_exit("not gzip");
497
498 inflate(dd, bb);
499
500 // tail: crc32, len32
501 bitbuf_skip(bb, (8-bb->bitpos)&7);
502 if (~dd->crc != bitbuf_get(bb, 32) || dd->len != bitbuf_get(bb, 32))
503 error_exit("bad crc");
504 rc += dd->len;
505
506 bitbuf_skip(bb, (8-bb->bitpos)&7);
507 dd->pos = dd->len = 0;
508 } while (bitbuf_skip(bb, 0));
509 free(bb);
510 free(dd);
511
512 return rc;
513 }
514