1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2 *
3 * Copyright 2014 Rob Landley <rob@landley.net>
4 *
5 * The inflate/deflate code lives here, so the various things that use it
6 * either live here or call these commands to pipe data through them.
7 *
8 * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9 * (gzip already replaces "uncompress".)
10 *
11 * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12 * LSB 4.1 has gzip, gunzip, and zcat
13 * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14
15 // Accept many different kinds of command line argument.
16 // Leave Lrg at end so flag values line up.
17
18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19 USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
20 USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
21 USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
22
23 //zip unzip gzip gunzip zcat
24
25 config COMPRESS
26 bool "compress"
27 default n
28 help
29 usage: compress [-zgLR19] [FILE]
30
31 Compress or decompress file (or stdin) using "deflate" algorithm.
32
33 -1 min compression
34 -9 max compression (default)
35 -g gzip (default)
36 -L zlib
37 -R raw
38 -z zip
39
40 config GZIP
41 bool "gzip"
42 default y
43 depends on COMPRESS
44 help
45 usage: gzip [-19cfqStvzgLR] [FILE...]
46
47 Compess (deflate) file(s). With no files, compress stdin to stdout.
48
49 On successful decompression, compressed files are replaced with the
50 uncompressed version. The input file is removed and replaced with
51 a new file without the .gz extension (with same ownership/permissions).
52
53 -1 Minimal compression (fastest)
54 -9 Max compression (default)
55 -c cat to stdout (act as zcat)
56 -f force (if output file exists, input is tty, unrecognized extension)
57 -q quiet (no warnings)
58 -S specify exension (default .*)
59 -t test compressed file(s)
60 -v verbose (like -l, but compress files)
61
62 Compression type:
63 -g gzip (default) -L zlib -R raw -z zip
64
65 config GZIP_D
66 bool
67 default y
68 depends on GZIP && DECOMPRESS
69 help
70 usage: gzip [-d]
71
72 -d decompress (act as gunzip)
73
74 config DECOMPRESS
75 bool "decompress"
76 default n
77 help
78 usage: compress [-zglrcd9] [FILE]
79
80 Compress or decompress file (or stdin) using "deflate" algorithm.
81
82 -c compress with -g gzip (default) -l zlib -r raw -z zip
83 -d decompress (autodetects type)
84
85
86 config ZCAT
87 bool "zcat"
88 default y
89 depends on DECOMPRESS
90 help
91 usage: zcat [FILE...]
92
93 Decompress deflated file(s) to stdout
94
95 config GUNZIP
96 bool "gunzip"
97 default y
98 depends on DECOMPRESS
99 help
100 usage: gunzip [-cflqStv] [FILE...]
101
102 Decompess (deflate) file(s). With no files, compress stdin to stdout.
103
104 On successful decompression, compressed files are replaced with the
105 uncompressed version. The input file is removed and replaced with
106 a new file without the .gz extension (with same ownership/permissions).
107
108 -c cat to stdout (act as zcat)
109 -f force (output file exists, input is tty, unrecognized extension)
110 -l list compressed/uncompressed/ratio/name for each input file.
111 -q quiet (no warnings)
112 -S specify exension (default .*)
113 -t test compressed file(s)
114 -v verbose (like -l, but decompress files)
115 */
116
117 #define FOR_compress
118 #include "toys.h"
119
120 GLOBALS(
121 // Huffman codes: base offset and extra bits tables (length and distance)
122 char lenbits[29], distbits[30];
123 unsigned short lenbase[29], distbase[30];
124 void *fixdisthuff, *fixlithuff;
125
126 // CRC
127 void (*crcfunc)(char *data, int len);
128 unsigned crc;
129
130 // Compressed data buffer
131 char *data;
132 unsigned pos, len;
133 int infd, outfd;
134
135 // Tables only used for deflation
136 unsigned short *hashhead, *hashchain;
137 )
138
139 // little endian bit buffer
140 struct bitbuf {
141 int fd, bitpos, len, max;
142 char buf[];
143 };
144
145 // malloc a struct bitbuf
bitbuf_init(int fd,int size)146 struct bitbuf *bitbuf_init(int fd, int size)
147 {
148 struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
149
150 bb->max = size;
151 bb->fd = fd;
152
153 return bb;
154 }
155
156 // Advance bitpos without the overhead of recording bits
bitbuf_skip(struct bitbuf * bb,int bits)157 void bitbuf_skip(struct bitbuf *bb, int bits)
158 {
159 int pos = bb->bitpos + bits, len = bb->len << 3;
160
161 while (pos >= len) {
162 pos -= len;
163 len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
164 if (bb->len < 1) perror_exit("inflate EOF");
165 }
166 bb->bitpos = pos;
167 }
168
169 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)170 static inline int bitbuf_bit(struct bitbuf *bb)
171 {
172 int bufpos = bb->bitpos>>3;
173
174 if (bufpos == bb->len) {
175 bitbuf_skip(bb, 0);
176 bufpos = 0;
177 }
178
179 return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
180 }
181
182 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)183 unsigned bitbuf_get(struct bitbuf *bb, int bits)
184 {
185 int result = 0, offset = 0;
186
187 while (bits) {
188 int click = bb->bitpos >> 3, blow, blen;
189
190 // Load more data if buffer empty
191 if (click == bb->len) bitbuf_skip(bb, click = 0);
192
193 // grab bits from next byte
194 blow = bb->bitpos & 7;
195 blen = 8-blow;
196 if (blen > bits) blen = bits;
197 result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
198 offset += blen;
199 bits -= blen;
200 bb->bitpos += blen;
201 }
202
203 return result;
204 }
205
bitbuf_flush(struct bitbuf * bb)206 void bitbuf_flush(struct bitbuf *bb)
207 {
208 if (!bb->bitpos) return;
209
210 xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
211 memset(bb->buf, 0, bb->max);
212 bb->bitpos = 0;
213 }
214
bitbuf_put(struct bitbuf * bb,int data,int len)215 void bitbuf_put(struct bitbuf *bb, int data, int len)
216 {
217 while (len) {
218 int click = bb->bitpos >> 3, blow, blen;
219
220 // Flush buffer if necessary
221 if (click == bb->max) {
222 bitbuf_flush(bb);
223 click = 0;
224 }
225 blow = bb->bitpos & 7;
226 blen = 8-blow;
227 if (blen > len) blen = len;
228 bb->buf[click] |= data << blow;
229 bb->bitpos += blen;
230 data >>= blen;
231 len -= blen;
232 }
233 }
234
data_crc(char sym)235 static void data_crc(char sym)
236 {
237 TT.data[TT.pos++ & 32767] = sym;
238
239 if (!(TT.pos & 32767)) {
240 xwrite(TT.outfd, TT.data, 32768);
241 if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
242 }
243 }
244
245 // Huffman coding uses bits to traverse a binary tree to a leaf node,
246 // By placing frequently occurring symbols at shorter paths, frequently
247 // used symbols may be represented in fewer bits than uncommon symbols.
248
249 struct huff {
250 unsigned short length[16];
251 unsigned short symbol[288];
252 };
253
254 // Create simple huffman tree from array of bit lengths.
255
256 // The symbols in the huffman trees are sorted (first by bit length
257 // of the code to reach them, then by symbol number). This means that given
258 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)259 static void len2huff(struct huff *huff, char bitlen[], int len)
260 {
261 int offset[16];
262 int i;
263
264 // Count number of codes at each bit length
265 memset(huff, 0, sizeof(struct huff));
266 for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
267
268 // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
269 *huff->length = *offset = 0;
270 for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
271
272 for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
273 }
274
275 // Fetch and decode next huffman coded symbol from bitbuf.
276 // This takes advantage of the sorting to navigate the tree as an array:
277 // each time we fetch a bit we have all the codes at that bit level in
278 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)279 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
280 {
281 unsigned short *length = huff->length;
282 int start = 0, offset = 0;
283
284 // Traverse through the bit lengths until our code is in this range
285 for (;;) {
286 offset = (offset << 1) | bitbuf_bit(bb);
287 start += *++length;
288 if ((offset -= *length) < 0) break;
289 if ((length - huff->length) & 16) error_exit("bad symbol");
290 }
291
292 return huff->symbol[start + offset];
293 }
294
295 // Decompress deflated data from bitbuf to TT.outfd.
inflate(struct bitbuf * bb)296 static void inflate(struct bitbuf *bb)
297 {
298 TT.crc = ~0;
299 // repeat until spanked
300 for (;;) {
301 int final, type;
302
303 final = bitbuf_get(bb, 1);
304 type = bitbuf_get(bb, 2);
305
306 if (type == 3) error_exit("bad type");
307
308 // Uncompressed block?
309 if (!type) {
310 int len, nlen;
311
312 // Align to byte, read length
313 bitbuf_skip(bb, (8-bb->bitpos)&7);
314 len = bitbuf_get(bb, 16);
315 nlen = bitbuf_get(bb, 16);
316 if (len != (0xffff & ~nlen)) error_exit("bad len");
317
318 // Dump literal output data
319 while (len) {
320 int pos = bb->bitpos >> 3, bblen = bb->len - pos;
321 char *p = bb->buf+pos;
322
323 // dump bytes until done or end of current bitbuf contents
324 if (bblen > len) bblen = len;
325 pos = bblen;
326 while (pos--) data_crc(*(p++));
327 bitbuf_skip(bb, bblen << 3);
328 len -= bblen;
329 }
330
331 // Compressed block
332 } else {
333 struct huff *disthuff, *lithuff;
334
335 // Dynamic huffman codes?
336 if (type == 2) {
337 struct huff *h2 = ((struct huff *)toybuf)+1;
338 int i, litlen, distlen, hufflen;
339 char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
340 "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
341
342 // The huffman trees are stored as a series of bit lengths
343 litlen = bitbuf_get(bb, 5)+257; // max 288
344 distlen = bitbuf_get(bb, 5)+1; // max 32
345 hufflen = bitbuf_get(bb, 4)+4; // max 19
346
347 // The literal and distance codes are themselves compressed, in
348 // a complicated way: an array of bit lengths (hufflen many
349 // entries, each 3 bits) is used to fill out an array of 19 entries
350 // in a magic order, leaving the rest 0. Then make a tree out of it:
351 memset(bits = toybuf+1, 0, 19);
352 for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
353 len2huff(h2, bits, 19);
354
355 // Use that tree to read in the literal and distance bit lengths
356 for (i = 0; i < litlen + distlen;) {
357 int sym = huff_and_puff(bb, h2);
358
359 // 0-15 are literals, 16 = repeat previous code 3-6 times,
360 // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
361 if (sym < 16) bits[i++] = sym;
362 else {
363 int len = sym & 2;
364
365 len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
366 memset(bits+i, bits[i-1] * !(sym&3), len);
367 i += len;
368 }
369 }
370 if (i > litlen+distlen) error_exit("bad tree");
371
372 len2huff(lithuff = h2, bits, litlen);
373 len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
374
375 // Static huffman codes
376 } else {
377 lithuff = TT.fixlithuff;
378 disthuff = TT.fixdisthuff;
379 }
380
381 // Use huffman tables to decode block of compressed symbols
382 for (;;) {
383 int sym = huff_and_puff(bb, lithuff);
384
385 // Literal?
386 if (sym < 256) data_crc(sym);
387
388 // Copy range?
389 else if (sym > 256) {
390 int len, dist;
391
392 sym -= 257;
393 len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
394 sym = huff_and_puff(bb, disthuff);
395 dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
396 sym = TT.pos & 32767;
397
398 while (len--) data_crc(TT.data[(TT.pos-dist) & 32767]);
399
400 // End of block
401 } else break;
402 }
403 }
404
405 // Was that the last block?
406 if (final) break;
407 }
408
409 if (TT.pos & 32767) {
410 xwrite(TT.outfd, TT.data, TT.pos & 32767);
411 if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
412 }
413 }
414
415 // Deflate from TT.infd to bitbuf
416 // For deflate, TT.len = input read, TT.pos = input consumed
deflate(struct bitbuf * bb)417 static void deflate(struct bitbuf *bb)
418 {
419 char *data = TT.data;
420 int len, final = 0;
421
422 TT.crc = ~0;
423
424 while (!final) {
425 // Read next half-window of data if we haven't hit EOF yet.
426 len = readall(TT.infd, data+(TT.len&32768), 32768);
427 if (len < 0) perror_exit("read"); // todo: add filename
428 if (len != 32768) final++;
429 if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
430 // TT.len += len; crcfunc advances len
431
432 // store block as literal
433 bitbuf_put(bb, final, 1);
434 bitbuf_put(bb, 0, 1);
435
436 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
437 bitbuf_put(bb, len, 16);
438 bitbuf_put(bb, 0xffff & ~len, 16);
439
440 // repeat until spanked
441 while (TT.pos != TT.len) {
442 unsigned pos = TT.pos & 65535;
443
444 bitbuf_put(bb, data[pos], 8);
445
446 // need to refill buffer?
447 if (!(32767 & ++TT.pos) && !final) break;
448 }
449 }
450 bitbuf_flush(bb);
451 }
452
453 // Allocate memory for deflate/inflate.
init_deflate(int compress)454 static void init_deflate(int compress)
455 {
456 int i, n = 1;
457
458 // compress needs 64k data and 32k each for hashhead and hashchain.
459 // decompress just needs 32k data.
460 TT.data = xmalloc(32768*(compress ? 4 : 1));
461 if (compress) {
462 TT.hashhead = (unsigned short *)(TT.data + 65536);
463 TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
464 }
465
466 // Calculate lenbits, lenbase, distbits, distbase
467 *TT.lenbase = 3;
468 for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
469 if (i>4) {
470 if (!(i&3)) {
471 TT.lenbits[i]++;
472 n <<= 1;
473 }
474 if (i == 27) n--;
475 else TT.lenbits[i+1] = TT.lenbits[i];
476 }
477 TT.lenbase[i+1] = n + TT.lenbase[i];
478 }
479 n = 0;
480 for (i = 0; i<sizeof(TT.distbits); i++) {
481 TT.distbase[i] = 1<<n;
482 if (i) TT.distbase[i] += TT.distbase[i-1];
483 if (i>3 && !(i&1)) n++;
484 TT.distbits[i] = n;
485 }
486
487 // Init fixed huffman tables
488 for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
489 len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
490 memset(toybuf, 5, 30);
491 len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
492 }
493
494 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)495 static int is_gzip(struct bitbuf *bb)
496 {
497 int flags;
498
499 // Confirm signature
500 if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
501 return 0;
502 bitbuf_skip(bb, 6*8);
503
504 // Skip extra, name, comment, header CRC fields
505 if (flags & 4) bitbuf_skip(bb, 16);
506 if (flags & 8) while (bitbuf_get(bb, 8));
507 if (flags & 16) while (bitbuf_get(bb, 8));
508 if (flags & 2) bitbuf_skip(bb, 16);
509
510 return 1;
511 }
512
gzip_crc(char * data,int len)513 void gzip_crc(char *data, int len)
514 {
515 int i;
516 unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
517
518 crc = TT.crc;
519 for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
520 TT.crc = crc;
521 TT.len += len;
522 }
523
do_gzip(int fd,char * name)524 static void do_gzip(int fd, char *name)
525 {
526 struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
527
528 // Header from RFC 1952 section 2.2:
529 // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
530 // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
531 // Operating System (FF=unknown)
532
533 TT.infd = fd;
534 xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
535
536 // Use last 1k of toybuf for little endian crc table
537 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
538 TT.crcfunc = gzip_crc;
539
540 deflate(bb);
541
542 // tail: crc32, len32
543
544 bitbuf_put(bb, 0, (8-bb->bitpos)&7);
545 bitbuf_put(bb, ~TT.crc, 32);
546 bitbuf_put(bb, TT.len, 32);
547
548 bitbuf_flush(bb);
549 free(bb);
550 }
551
do_zcat(int fd,char * name)552 static void do_zcat(int fd, char *name)
553 {
554 struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
555
556 if (!is_gzip(bb)) error_exit("not gzip");
557 TT.outfd = 1;
558
559 // Use last 1k of toybuf for little endian crc table
560 crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
561 TT.crcfunc = gzip_crc;
562
563 inflate(bb);
564
565 // tail: crc32, len32
566
567 bitbuf_skip(bb, (8-bb->bitpos)&7);
568 if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
569 error_exit("bad crc");
570 free(bb);
571 }
572
573 // Parse many different kinds of command line argument:
574
compress_main(void)575 void compress_main(void)
576 {
577 // todo: this
578 printf("hello world");
579 }
580
581 //#define CLEANUP_compress
582 //#define FOR_zcat
583 //#include "generated/flags.h"
584
zcat_main(void)585 void zcat_main(void)
586 {
587 init_deflate(0);
588
589 loopfiles(toys.optargs, do_zcat);
590 }
591
gunzip_main(void)592 void gunzip_main(void)
593 {
594 init_deflate(0);
595
596 loopfiles(toys.optargs, do_zcat);
597 }
598
gzip_main(void)599 void gzip_main(void)
600 {
601 init_deflate(1);
602
603 loopfiles(toys.optargs, do_gzip);
604 }
605