1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * The inflate/deflate code lives here, so the various things that use it
6 * either live here or call these commands to pipe data through them.
7 *
8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9 * (gzip already replaces "uncompress".)
10 *
11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12 * LSB 4.1 has gzip, gunzip, and zcat
13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14
15 // Accept many different kinds of command line argument.
16 // Leave Lrg at end so flag values line up.
17
18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19
20 //zip unzip gzip gunzip zcat
21
22 config COMPRESS
23 bool "compress"
24 default n
25 help
26 usage: compress [-zgLR19] [FILE]
27
28 Compress or decompress file (or stdin) using "deflate" algorithm.
29
30 -1 min compression
31 -9 max compression (default)
32 -g gzip (default)
33 -L zlib
34 -R raw
35 -z zip
36
37 config DECOMPRESS
38 bool "decompress"
39 default n
40 help
41 usage: compress [-zglrcd9] [FILE]
42
43 Compress or decompress file (or stdin) using "deflate" algorithm.
44
45 -c compress with -g gzip (default) -l zlib -r raw -z zip
46 -d decompress (autodetects type)
47 */
48
49 #define FOR_compress
50 #include "toys.h"
51
52 GLOBALS(
53 // Huffman codes: base offset and extra bits tables (length and distance)
54 char lenbits[29], distbits[30];
55 unsigned short lenbase[29], distbase[30];
56 void *fixdisthuff, *fixlithuff;
57
58 // CRC
59 void (*crcfunc)(char *data, int len);
60 unsigned crc;
61
62 // Compressed data buffer
63 char *data;
64 unsigned pos, len;
65 int infd, outfd;
66
67 // Tables only used for deflation
68 unsigned short *hashhead, *hashchain;
69 )
70
71 // little endian bit buffer
72 struct bitbuf {
73 int fd, bitpos, len, max;
74 char buf[];
75 };
76
77 // malloc a struct bitbuf
bitbuf_init(int fd,int size)78 struct bitbuf *bitbuf_init(int fd, int size)
79 {
80 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
81
82 bb->max = size;
83 bb->fd = fd;
84
85 return bb;
86 }
87
88 // Advance bitpos without the overhead of recording bits
bitbuf_skip(struct bitbuf * bb,int bits)89 void bitbuf_skip(struct bitbuf *bb, int bits)
90 {
91 int pos = bb->bitpos + bits, len = bb->len << 3;
92
93 while (pos >= len) {
94 pos -= len;
95 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
96 if (bb->len < 1) perror_exit("inflate EOF");
97 }
98 bb->bitpos = pos;
99 }
100
101 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)102 static inline int bitbuf_bit(struct bitbuf *bb)
103 {
104 int bufpos = bb->bitpos>>3;
105
106 if (bufpos == bb->len) {
107 bitbuf_skip(bb, 0);
108 bufpos = 0;
109 }
110
111 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
112 }
113
114 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)115 unsigned bitbuf_get(struct bitbuf *bb, int bits)
116 {
117 int result = 0, offset = 0;
118
119 while (bits) {
120 int click = bb->bitpos >> 3, blow, blen;
121
122 // Load more data if buffer empty
123 if (click == bb->len) bitbuf_skip(bb, click = 0);
124
125 // grab bits from next byte
126 blow = bb->bitpos & 7;
127 blen = 8-blow;
128 if (blen > bits) blen = bits;
129 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
130 offset += blen;
131 bits -= blen;
132 bb->bitpos += blen;
133 }
134
135 return result;
136 }
137
bitbuf_flush(struct bitbuf * bb)138 void bitbuf_flush(struct bitbuf *bb)
139 {
140 if (!bb->bitpos) return;
141
142 xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
143 memset(bb->buf, 0, bb->max);
144 bb->bitpos = 0;
145 }
146
bitbuf_put(struct bitbuf * bb,int data,int len)147 void bitbuf_put(struct bitbuf *bb, int data, int len)
148 {
149 while (len) {
150 int click = bb->bitpos >> 3, blow, blen;
151
152 // Flush buffer if necessary
153 if (click == bb->max) {
154 bitbuf_flush(bb);
155 click = 0;
156 }
157 blow = bb->bitpos & 7;
158 blen = 8-blow;
159 if (blen > len) blen = len;
160 bb->buf[click] |= data << blow;
161 bb->bitpos += blen;
162 data >>= blen;
163 len -= blen;
164 }
165 }
166
output_byte(char sym)167 static void output_byte(char sym)
168 {
169 int pos = TT.pos++ & 32767;
170
171 TT.data[pos] = sym;
172
173 if (pos == 32767) {
174 xwrite(TT.outfd, TT.data, 32768);
175 if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
176 }
177 }
178
179 // Huffman coding uses bits to traverse a binary tree to a leaf node,
180 // By placing frequently occurring symbols at shorter paths, frequently
181 // used symbols may be represented in fewer bits than uncommon symbols.
182
183 struct huff {
184 unsigned short length[16];
185 unsigned short symbol[288];
186 };
187
188 // Create simple huffman tree from array of bit lengths.
189
190 // The symbols in the huffman trees are sorted (first by bit length
191 // of the code to reach them, then by symbol number). This means that given
192 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)193 static void len2huff(struct huff *huff, char bitlen[], int len)
194 {
195 int offset[16];
196 int i;
197
198 // Count number of codes at each bit length
199 memset(huff, 0, sizeof(struct huff));
200 for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
201
202 // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
203 *huff->length = *offset = 0;
204 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
205
206 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
207 }
208
209 // Fetch and decode next huffman coded symbol from bitbuf.
210 // This takes advantage of the sorting to navigate the tree as an array:
211 // each time we fetch a bit we have all the codes at that bit level in
212 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)213 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
214 {
215 unsigned short *length = huff->length;
216 int start = 0, offset = 0;
217
218 // Traverse through the bit lengths until our code is in this range
219 for (;;) {
220 offset = (offset << 1) | bitbuf_bit(bb);
221 start += *++length;
222 if ((offset -= *length) < 0) break;
223 if ((length - huff->length) & 16) error_exit("bad symbol");
224 }
225
226 return huff->symbol[start + offset];
227 }
228
229 // Decompress deflated data from bitbuf to TT.outfd.
inflate(struct bitbuf * bb)230 static void inflate(struct bitbuf *bb)
231 {
232 TT.crc = ~0;
233 // repeat until spanked
234 for (;;) {
235 int final, type;
236
237 final = bitbuf_get(bb, 1);
238 type = bitbuf_get(bb, 2);
239
240 if (type == 3) error_exit("bad type");
241
242 // Uncompressed block?
243 if (!type) {
244 int len, nlen;
245
246 // Align to byte, read length
247 bitbuf_skip(bb, (8-bb->bitpos)&7);
248 len = bitbuf_get(bb, 16);
249 nlen = bitbuf_get(bb, 16);
250 if (len != (0xffff & ~nlen)) error_exit("bad len");
251
252 // Dump literal output data
253 while (len) {
254 int pos = bb->bitpos >> 3, bblen = bb->len - pos;
255 char *p = bb->buf+pos;
256
257 // dump bytes until done or end of current bitbuf contents
258 if (bblen > len) bblen = len;
259 pos = bblen;
260 while (pos--) output_byte(*(p++));
261 bitbuf_skip(bb, bblen << 3);
262 len -= bblen;
263 }
264
265 // Compressed block
266 } else {
267 struct huff *disthuff, *lithuff;
268
269 // Dynamic huffman codes?
270 if (type == 2) {
271 struct huff *h2 = ((struct huff *)toybuf)+1;
272 int i, litlen, distlen, hufflen;
273 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
274 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
275
276 // The huffman trees are stored as a series of bit lengths
277 litlen = bitbuf_get(bb, 5)+257; // max 288
278 distlen = bitbuf_get(bb, 5)+1; // max 32
279 hufflen = bitbuf_get(bb, 4)+4; // max 19
280
281 // The literal and distance codes are themselves compressed, in
282 // a complicated way: an array of bit lengths (hufflen many
283 // entries, each 3 bits) is used to fill out an array of 19 entries
284 // in a magic order, leaving the rest 0. Then make a tree out of it:
285 memset(bits = toybuf+1, 0, 19);
286 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
287 len2huff(h2, bits, 19);
288
289 // Use that tree to read in the literal and distance bit lengths
290 for (i = 0; i < litlen + distlen;) {
291 int sym = huff_and_puff(bb, h2);
292
293 // 0-15 are literals, 16 = repeat previous code 3-6 times,
294 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
295 if (sym < 16) bits[i++] = sym;
296 else {
297 int len = sym & 2;
298
299 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
300 memset(bits+i, bits[i-1] * !(sym&3), len);
301 i += len;
302 }
303 }
304 if (i > litlen+distlen) error_exit("bad tree");
305
306 len2huff(lithuff = h2, bits, litlen);
307 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
308
309 // Static huffman codes
310 } else {
311 lithuff = TT.fixlithuff;
312 disthuff = TT.fixdisthuff;
313 }
314
315 // Use huffman tables to decode block of compressed symbols
316 for (;;) {
317 int sym = huff_and_puff(bb, lithuff);
318
319 // Literal?
320 if (sym < 256) output_byte(sym);
321
322 // Copy range?
323 else if (sym > 256) {
324 int len, dist;
325
326 sym -= 257;
327 len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
328 sym = huff_and_puff(bb, disthuff);
329 dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
330 sym = TT.pos & 32767;
331
332 while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
333
334 // End of block
335 } else break;
336 }
337 }
338
339 // Was that the last block?
340 if (final) break;
341 }
342
343 if (TT.pos & 32767) {
344 xwrite(TT.outfd, TT.data, TT.pos & 32767);
345 if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
346 }
347 }
348
349 // Deflate from TT.infd to bitbuf
350 // For deflate, TT.len = input read, TT.pos = input consumed
deflate(struct bitbuf * bb)351 static void deflate(struct bitbuf *bb)
352 {
353 char *data = TT.data;
354 int len, final = 0;
355
356 TT.crc = ~0;
357
358 while (!final) {
359 // Read next half-window of data if we haven't hit EOF yet.
360 len = readall(TT.infd, data+(TT.len&32768), 32768);
361 if (len < 0) perror_exit("read"); // todo: add filename
362 if (len != 32768) final++;
363 if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
364 // TT.len += len; crcfunc advances len
365
366 // store block as literal
367 bitbuf_put(bb, final, 1);
368 bitbuf_put(bb, 0, 1);
369
370 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
371 bitbuf_put(bb, len, 16);
372 bitbuf_put(bb, 0xffff & ~len, 16);
373
374 // repeat until spanked
375 while (TT.pos != TT.len) {
376 unsigned pos = TT.pos & 65535;
377
378 bitbuf_put(bb, data[pos], 8);
379
380 // need to refill buffer?
381 if (!(32767 & ++TT.pos) && !final) break;
382 }
383 }
384 bitbuf_flush(bb);
385 }
386
387 // Allocate memory for deflate/inflate.
init_deflate(int compress)388 static void init_deflate(int compress)
389 {
390 int i, n = 1;
391
392 // compress needs 64k data and 32k each for hashhead and hashchain.
393 // decompress just needs 32k data.
394 TT.data = xmalloc(32768*(compress ? 4 : 1));
395 if (compress) {
396 TT.hashhead = (unsigned short *)(TT.data + 65536);
397 TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
398 }
399
400 // Calculate lenbits, lenbase, distbits, distbase
401 *TT.lenbase = 3;
402 for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
403 if (i>4) {
404 if (!(i&3)) {
405 TT.lenbits[i]++;
406 n <<= 1;
407 }
408 if (i == 27) n--;
409 else TT.lenbits[i+1] = TT.lenbits[i];
410 }
411 TT.lenbase[i+1] = n + TT.lenbase[i];
412 }
413 n = 0;
414 for (i = 0; i<sizeof(TT.distbits); i++) {
415 TT.distbase[i] = 1<<n;
416 if (i) TT.distbase[i] += TT.distbase[i-1];
417 if (i>3 && !(i&1)) n++;
418 TT.distbits[i] = n;
419 }
420
421 // Init fixed huffman tables
422 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
423 len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
424 memset(toybuf, 5, 30);
425 len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
426 }
427
428 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)429 static int is_gzip(struct bitbuf *bb)
430 {
431 int flags;
432
433 // Confirm signature
434 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
435 return 0;
436 bitbuf_skip(bb, 6*8);
437
438 // Skip extra, name, comment, header CRC fields
439 if (flags & 4) bitbuf_skip(bb, 16);
440 if (flags & 8) while (bitbuf_get(bb, 8));
441 if (flags & 16) while (bitbuf_get(bb, 8));
442 if (flags & 2) bitbuf_skip(bb, 16);
443
444 return 1;
445 }
446
gzip_crc(char * data,int len)447 void gzip_crc(char *data, int len)
448 {
449 int i;
450 unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
451
452 crc = TT.crc;
453 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
454 TT.crc = crc;
455 TT.len += len;
456 }
457
do_gzip(int fd,char * name)458 static void do_gzip(int fd, char *name)
459 {
460 struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
461
462 // Header from RFC 1952 section 2.2:
463 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
464 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
465 // Operating System (FF=unknown)
466
467 TT.infd = fd;
468 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
469
470 // Use last 1k of toybuf for little endian crc table
471 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
472 TT.crcfunc = gzip_crc;
473
474 deflate(bb);
475
476 // tail: crc32, len32
477
478 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
479 bitbuf_put(bb, ~TT.crc, 32);
480 bitbuf_put(bb, TT.len, 32);
481
482 bitbuf_flush(bb);
483 free(bb);
484 }
485
do_zcat(int fd,char * name)486 static void do_zcat(int fd, char *name)
487 {
488 struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
489
490 if (!is_gzip(bb)) error_exit("not gzip");
491 TT.outfd = 1;
492
493 // Use last 1k of toybuf for little endian crc table
494 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
495 TT.crcfunc = gzip_crc;
496
497 inflate(bb);
498
499 // tail: crc32, len32
500
501 bitbuf_skip(bb, (8-bb->bitpos)&7);
502 if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
503 error_exit("bad crc");
504 free(bb);
505 }
506
507 // Parse many different kinds of command line argument:
508
compress_main(void)509 void compress_main(void)
510 {
511 // todo: this
512 printf("hello world");
513 }
514