• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * The inflate/deflate code lives here, so the various things that use it
6  * either live here or call these commands to pipe data through them.
7  *
8  * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9  * (gzip already replaces "uncompress".)
10  *
11  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12  * LSB 4.1 has gzip, gunzip, and zcat
13  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14 
15 // Accept many different kinds of command line argument.
16 // Leave Lrg at end so flag values line up.
17 
18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19 USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
20 USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
21 USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
22 
23 //zip unzip gzip gunzip zcat
24 
25 config COMPRESS
26   bool "compress"
27   default n
28   help
29     usage: compress [-zgLR19] [FILE]
30 
31     Compress or decompress file (or stdin) using "deflate" algorithm.
32 
33     -1	min compression
34     -9	max compression (default)
35     -g	gzip (default)
36     -L	zlib
37     -R	raw
38     -z	zip
39 
40 config GZIP
41   bool "gzip"
42   default y
43   depends on COMPRESS
44   help
45     usage: gzip [-19cfqStvzgLR] [FILE...]
46 
47     Compess (deflate) file(s). With no files, compress stdin to stdout.
48 
49     On successful decompression, compressed files are replaced with the
50     uncompressed version. The input file is removed and replaced with
51     a new file without the .gz extension (with same ownership/permissions).
52 
53     -1	Minimal compression (fastest)
54     -9	Max compression (default)
55     -c	cat to stdout (act as zcat)
56     -f	force (if output file exists, input is tty, unrecognized extension)
57     -q	quiet (no warnings)
58     -S	specify exension (default .*)
59     -t	test compressed file(s)
60     -v	verbose (like -l, but compress files)
61 
62     Compression type:
63     -g gzip (default)    -L zlib    -R raw    -z zip
64 
65 config GZIP_D
66   bool
67   default y
68   depends on GZIP && DECOMPRESS
69   help
70     usage: gzip [-d]
71 
72     -d	decompress (act as gunzip)
73 
74 config DECOMPRESS
75   bool "decompress"
76   default n
77   help
78     usage: compress [-zglrcd9] [FILE]
79 
80     Compress or decompress file (or stdin) using "deflate" algorithm.
81 
82     -c	compress with -g gzip (default)  -l zlib  -r raw  -z zip
83     -d	decompress (autodetects type)
84 
85 
86 config ZCAT
87   bool "zcat"
88   default y
89   depends on DECOMPRESS
90   help
91     usage: zcat [FILE...]
92 
93     Decompress deflated file(s) to stdout
94 
95 config GUNZIP
96   bool "gunzip"
97   default y
98   depends on DECOMPRESS
99   help
100     usage: gunzip [-cflqStv] [FILE...]
101 
102     Decompess (deflate) file(s). With no files, compress stdin to stdout.
103 
104     On successful decompression, compressed files are replaced with the
105     uncompressed version. The input file is removed and replaced with
106     a new file without the .gz extension (with same ownership/permissions).
107 
108     -c	cat to stdout (act as zcat)
109     -f	force (output file exists, input is tty, unrecognized extension)
110     -l	list compressed/uncompressed/ratio/name for each input file.
111     -q	quiet (no warnings)
112     -S	specify exension (default .*)
113     -t	test compressed file(s)
114     -v	verbose (like -l, but decompress files)
115 */
116 
117 #define FOR_compress
118 #include "toys.h"
119 
120 GLOBALS(
121   // Huffman codes: base offset and extra bits tables (length and distance)
122   char lenbits[29], distbits[30];
123   unsigned short lenbase[29], distbase[30];
124   void *fixdisthuff, *fixlithuff;
125 
126   // CRC
127   void (*crcfunc)(char *data, int len);
128   unsigned crc;
129 
130   // Compressed data buffer
131   char *data;
132   unsigned pos, len;
133   int infd, outfd;
134 
135   // Tables only used for deflation
136   unsigned short *hashhead, *hashchain;
137 )
138 
139 // little endian bit buffer
140 struct bitbuf {
141   int fd, bitpos, len, max;
142   char buf[];
143 };
144 
145 // malloc a struct bitbuf
bitbuf_init(int fd,int size)146 struct bitbuf *bitbuf_init(int fd, int size)
147 {
148   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
149 
150   bb->max = size;
151   bb->fd = fd;
152 
153   return bb;
154 }
155 
156 // Advance bitpos without the overhead of recording bits
bitbuf_skip(struct bitbuf * bb,int bits)157 void bitbuf_skip(struct bitbuf *bb, int bits)
158 {
159   int pos = bb->bitpos + bits, len = bb->len << 3;
160 
161   while (pos >= len) {
162     pos -= len;
163     len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
164     if (bb->len < 1) perror_exit("inflate EOF");
165   }
166   bb->bitpos = pos;
167 }
168 
169 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)170 static inline int bitbuf_bit(struct bitbuf *bb)
171 {
172   int bufpos = bb->bitpos>>3;
173 
174   if (bufpos == bb->len) {
175     bitbuf_skip(bb, 0);
176     bufpos = 0;
177   }
178 
179   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
180 }
181 
182 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)183 unsigned bitbuf_get(struct bitbuf *bb, int bits)
184 {
185   int result = 0, offset = 0;
186 
187   while (bits) {
188     int click = bb->bitpos >> 3, blow, blen;
189 
190     // Load more data if buffer empty
191     if (click == bb->len) bitbuf_skip(bb, click = 0);
192 
193     // grab bits from next byte
194     blow = bb->bitpos & 7;
195     blen = 8-blow;
196     if (blen > bits) blen = bits;
197     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
198     offset += blen;
199     bits -= blen;
200     bb->bitpos += blen;
201   }
202 
203   return result;
204 }
205 
bitbuf_flush(struct bitbuf * bb)206 void bitbuf_flush(struct bitbuf *bb)
207 {
208   if (!bb->bitpos) return;
209 
210   xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
211   memset(bb->buf, 0, bb->max);
212   bb->bitpos = 0;
213 }
214 
bitbuf_put(struct bitbuf * bb,int data,int len)215 void bitbuf_put(struct bitbuf *bb, int data, int len)
216 {
217   while (len) {
218     int click = bb->bitpos >> 3, blow, blen;
219 
220     // Flush buffer if necessary
221     if (click == bb->max) {
222       bitbuf_flush(bb);
223       click = 0;
224     }
225     blow = bb->bitpos & 7;
226     blen = 8-blow;
227     if (blen > len) blen = len;
228     bb->buf[click] |= data << blow;
229     bb->bitpos += blen;
230     data >>= blen;
231     len -= blen;
232   }
233 }
234 
data_crc(char sym)235 static void data_crc(char sym)
236 {
237   TT.data[TT.pos++ & 32767] = sym;
238 
239   if (!(TT.pos & 32767)) {
240     xwrite(TT.outfd, TT.data, 32768);
241     if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
242   }
243 }
244 
245 // Huffman coding uses bits to traverse a binary tree to a leaf node,
246 // By placing frequently occurring symbols at shorter paths, frequently
247 // used symbols may be represented in fewer bits than uncommon symbols.
248 
249 struct huff {
250   unsigned short length[16];
251   unsigned short symbol[288];
252 };
253 
254 // Create simple huffman tree from array of bit lengths.
255 
256 // The symbols in the huffman trees are sorted (first by bit length
257 // of the code to reach them, then by symbol number). This means that given
258 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)259 static void len2huff(struct huff *huff, char bitlen[], int len)
260 {
261   int offset[16];
262   int i;
263 
264   // Count number of codes at each bit length
265   memset(huff, 0, sizeof(struct huff));
266   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
267 
268   // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
269   *huff->length = *offset = 0;
270   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
271 
272   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
273 }
274 
275 // Fetch and decode next huffman coded symbol from bitbuf.
276 // This takes advantage of the sorting to navigate the tree as an array:
277 // each time we fetch a bit we have all the codes at that bit level in
278 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)279 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
280 {
281   unsigned short *length = huff->length;
282   int start = 0, offset = 0;
283 
284   // Traverse through the bit lengths until our code is in this range
285   for (;;) {
286     offset = (offset << 1) | bitbuf_bit(bb);
287     start += *++length;
288     if ((offset -= *length) < 0) break;
289     if ((length - huff->length) & 16) error_exit("bad symbol");
290   }
291 
292   return huff->symbol[start + offset];
293 }
294 
295 // Decompress deflated data from bitbuf to TT.outfd.
inflate(struct bitbuf * bb)296 static void inflate(struct bitbuf *bb)
297 {
298   TT.crc = ~0;
299   // repeat until spanked
300   for (;;) {
301     int final, type;
302 
303     final = bitbuf_get(bb, 1);
304     type = bitbuf_get(bb, 2);
305 
306     if (type == 3) error_exit("bad type");
307 
308     // Uncompressed block?
309     if (!type) {
310       int len, nlen;
311 
312       // Align to byte, read length
313       bitbuf_skip(bb, (8-bb->bitpos)&7);
314       len = bitbuf_get(bb, 16);
315       nlen = bitbuf_get(bb, 16);
316       if (len != (0xffff & ~nlen)) error_exit("bad len");
317 
318       // Dump literal output data
319       while (len) {
320         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
321         char *p = bb->buf+pos;
322 
323         // dump bytes until done or end of current bitbuf contents
324         if (bblen > len) bblen = len;
325         pos = bblen;
326         while (pos--) data_crc(*(p++));
327         bitbuf_skip(bb, bblen << 3);
328         len -= bblen;
329       }
330 
331     // Compressed block
332     } else {
333       struct huff *disthuff, *lithuff;
334 
335       // Dynamic huffman codes?
336       if (type == 2) {
337         struct huff *h2 = ((struct huff *)toybuf)+1;
338         int i, litlen, distlen, hufflen;
339         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
340                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
341 
342         // The huffman trees are stored as a series of bit lengths
343         litlen = bitbuf_get(bb, 5)+257;  // max 288
344         distlen = bitbuf_get(bb, 5)+1;   // max 32
345         hufflen = bitbuf_get(bb, 4)+4;   // max 19
346 
347         // The literal and distance codes are themselves compressed, in
348         // a complicated way: an array of bit lengths (hufflen many
349         // entries, each 3 bits) is used to fill out an array of 19 entries
350         // in a magic order, leaving the rest 0. Then make a tree out of it:
351         memset(bits = toybuf+1, 0, 19);
352         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
353         len2huff(h2, bits, 19);
354 
355         // Use that tree to read in the literal and distance bit lengths
356         for (i = 0; i < litlen + distlen;) {
357           int sym = huff_and_puff(bb, h2);
358 
359           // 0-15 are literals, 16 = repeat previous code 3-6 times,
360           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
361           if (sym < 16) bits[i++] = sym;
362           else {
363             int len = sym & 2;
364 
365             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
366             memset(bits+i, bits[i-1] * !(sym&3), len);
367             i += len;
368           }
369         }
370         if (i > litlen+distlen) error_exit("bad tree");
371 
372         len2huff(lithuff = h2, bits, litlen);
373         len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
374 
375       // Static huffman codes
376       } else {
377         lithuff = TT.fixlithuff;
378         disthuff = TT.fixdisthuff;
379       }
380 
381       // Use huffman tables to decode block of compressed symbols
382       for (;;) {
383         int sym = huff_and_puff(bb, lithuff);
384 
385         // Literal?
386         if (sym < 256) data_crc(sym);
387 
388         // Copy range?
389         else if (sym > 256) {
390           int len, dist;
391 
392           sym -= 257;
393           len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
394           sym = huff_and_puff(bb, disthuff);
395           dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
396           sym = TT.pos & 32767;
397 
398           while (len--) data_crc(TT.data[(TT.pos-dist) & 32767]);
399 
400         // End of block
401         } else break;
402       }
403     }
404 
405     // Was that the last block?
406     if (final) break;
407   }
408 
409   if (TT.pos & 32767) {
410     xwrite(TT.outfd, TT.data, TT.pos & 32767);
411     if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
412   }
413 }
414 
415 // Deflate from TT.infd to bitbuf
416 // For deflate, TT.len = input read, TT.pos = input consumed
deflate(struct bitbuf * bb)417 static void deflate(struct bitbuf *bb)
418 {
419   char *data = TT.data;
420   int len, final = 0;
421 
422   TT.crc = ~0;
423 
424   while (!final) {
425     // Read next half-window of data if we haven't hit EOF yet.
426     len = readall(TT.infd, data+(TT.len&32768), 32768);
427     if (len < 0) perror_exit("read"); // todo: add filename
428     if (len != 32768) final++;
429     if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
430     // TT.len += len;  crcfunc advances len
431 
432     // store block as literal
433     bitbuf_put(bb, final, 1);
434     bitbuf_put(bb, 0, 1);
435 
436     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
437     bitbuf_put(bb, len, 16);
438     bitbuf_put(bb, 0xffff & ~len, 16);
439 
440     // repeat until spanked
441     while (TT.pos != TT.len) {
442       unsigned pos = TT.pos & 65535;
443 
444       bitbuf_put(bb, data[pos], 8);
445 
446       // need to refill buffer?
447       if (!(32767 & ++TT.pos) && !final) break;
448     }
449   }
450   bitbuf_flush(bb);
451 }
452 
453 // Allocate memory for deflate/inflate.
init_deflate(int compress)454 static void init_deflate(int compress)
455 {
456   int i, n = 1;
457 
458   // compress needs 64k data and 32k each for hashhead and hashchain.
459   // decompress just needs 32k data.
460   TT.data = xmalloc(32768*(compress ? 4 : 1));
461   if (compress) {
462     TT.hashhead = (unsigned short *)(TT.data + 65536);
463     TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
464   }
465 
466   // Calculate lenbits, lenbase, distbits, distbase
467   *TT.lenbase = 3;
468   for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
469     if (i>4) {
470       if (!(i&3)) {
471         TT.lenbits[i]++;
472         n <<= 1;
473       }
474       if (i == 27) n--;
475       else TT.lenbits[i+1] = TT.lenbits[i];
476     }
477     TT.lenbase[i+1] = n + TT.lenbase[i];
478   }
479   n = 0;
480   for (i = 0; i<sizeof(TT.distbits); i++) {
481     TT.distbase[i] = 1<<n;
482     if (i) TT.distbase[i] += TT.distbase[i-1];
483     if (i>3 && !(i&1)) n++;
484     TT.distbits[i] = n;
485   }
486 
487   // Init fixed huffman tables
488   for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
489   len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
490   memset(toybuf, 5, 30);
491   len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
492 }
493 
494 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)495 static int is_gzip(struct bitbuf *bb)
496 {
497   int flags;
498 
499   // Confirm signature
500   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
501     return 0;
502   bitbuf_skip(bb, 6*8);
503 
504   // Skip extra, name, comment, header CRC fields
505   if (flags & 4) bitbuf_skip(bb, 16);
506   if (flags & 8) while (bitbuf_get(bb, 8));
507   if (flags & 16) while (bitbuf_get(bb, 8));
508   if (flags & 2) bitbuf_skip(bb, 16);
509 
510   return 1;
511 }
512 
gzip_crc(char * data,int len)513 void gzip_crc(char *data, int len)
514 {
515   int i;
516   unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
517 
518   crc = TT.crc;
519   for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
520   TT.crc = crc;
521   TT.len += len;
522 }
523 
do_gzip(int fd,char * name)524 static void do_gzip(int fd, char *name)
525 {
526   struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
527 
528   // Header from RFC 1952 section 2.2:
529   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
530   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
531   // Operating System (FF=unknown)
532 
533   TT.infd = fd;
534   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
535 
536   // Use last 1k of toybuf for little endian crc table
537   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
538   TT.crcfunc = gzip_crc;
539 
540   deflate(bb);
541 
542   // tail: crc32, len32
543 
544   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
545   bitbuf_put(bb, ~TT.crc, 32);
546   bitbuf_put(bb, TT.len, 32);
547 
548   bitbuf_flush(bb);
549   free(bb);
550 }
551 
do_zcat(int fd,char * name)552 static void do_zcat(int fd, char *name)
553 {
554   struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
555 
556   if (!is_gzip(bb)) error_exit("not gzip");
557   TT.outfd = 1;
558 
559   // Use last 1k of toybuf for little endian crc table
560   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
561   TT.crcfunc = gzip_crc;
562 
563   inflate(bb);
564 
565   // tail: crc32, len32
566 
567   bitbuf_skip(bb, (8-bb->bitpos)&7);
568   if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
569     error_exit("bad crc");
570   free(bb);
571 }
572 
573 // Parse many different kinds of command line argument:
574 
compress_main(void)575 void compress_main(void)
576 {
577   // todo: this
578   printf("hello world");
579 }
580 
581 //#define CLEANUP_compress
582 //#define FOR_zcat
583 //#include "generated/flags.h"
584 
zcat_main(void)585 void zcat_main(void)
586 {
587   init_deflate(0);
588 
589   loopfiles(toys.optargs, do_zcat);
590 }
591 
gunzip_main(void)592 void gunzip_main(void)
593 {
594   init_deflate(0);
595 
596   loopfiles(toys.optargs, do_zcat);
597 }
598 
gzip_main(void)599 void gzip_main(void)
600 {
601   init_deflate(1);
602 
603   loopfiles(toys.optargs, do_gzip);
604 }
605