• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
2  *
3  * Copyright 2014 Rob Landley <rob@landley.net>
4  *
5  * The inflate/deflate code lives here, so the various things that use it
6  * either live here or call these commands to pipe data through them.
7  *
8  * Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
9  * (gzip already replaces "uncompress".)
10  *
11  * See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
12  * LSB 4.1 has gzip, gunzip, and zcat
13  * TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
14 
15 // Accept many different kinds of command line argument.
16 // Leave Lrg at end so flag values line up.
17 
18 USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
19 
20 //zip unzip gzip gunzip zcat
21 
22 config COMPRESS
23   bool "compress"
24   default n
25   help
26     usage: compress [-zgLR19] [FILE]
27 
28     Compress or decompress file (or stdin) using "deflate" algorithm.
29 
30     -1	min compression
31     -9	max compression (default)
32     -g	gzip (default)
33     -L	zlib
34     -R	raw
35     -z	zip
36 
37 config DECOMPRESS
38   bool "decompress"
39   default n
40   help
41     usage: compress [-zglrcd9] [FILE]
42 
43     Compress or decompress file (or stdin) using "deflate" algorithm.
44 
45     -c	compress with -g gzip (default)  -l zlib  -r raw  -z zip
46     -d	decompress (autodetects type)
47 */
48 
49 #define FOR_compress
50 #include "toys.h"
51 
52 GLOBALS(
53   // Huffman codes: base offset and extra bits tables (length and distance)
54   char lenbits[29], distbits[30];
55   unsigned short lenbase[29], distbase[30];
56   void *fixdisthuff, *fixlithuff;
57 
58   // CRC
59   void (*crcfunc)(char *data, int len);
60   unsigned crc;
61 
62   // Compressed data buffer
63   char *data;
64   unsigned pos, len;
65   int infd, outfd;
66 
67   // Tables only used for deflation
68   unsigned short *hashhead, *hashchain;
69 )
70 
71 // little endian bit buffer
72 struct bitbuf {
73   int fd, bitpos, len, max;
74   char buf[];
75 };
76 
77 // malloc a struct bitbuf
bitbuf_init(int fd,int size)78 struct bitbuf *bitbuf_init(int fd, int size)
79 {
80   struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
81 
82   bb->max = size;
83   bb->fd = fd;
84 
85   return bb;
86 }
87 
88 // Advance bitpos without the overhead of recording bits
bitbuf_skip(struct bitbuf * bb,int bits)89 void bitbuf_skip(struct bitbuf *bb, int bits)
90 {
91   int pos = bb->bitpos + bits, len = bb->len << 3;
92 
93   while (pos >= len) {
94     pos -= len;
95     len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
96     if (bb->len < 1) perror_exit("inflate EOF");
97   }
98   bb->bitpos = pos;
99 }
100 
101 // Optimized single bit inlined version
bitbuf_bit(struct bitbuf * bb)102 static inline int bitbuf_bit(struct bitbuf *bb)
103 {
104   int bufpos = bb->bitpos>>3;
105 
106   if (bufpos == bb->len) {
107     bitbuf_skip(bb, 0);
108     bufpos = 0;
109   }
110 
111   return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
112 }
113 
114 // Fetch the next X bits from the bitbuf, little endian
bitbuf_get(struct bitbuf * bb,int bits)115 unsigned bitbuf_get(struct bitbuf *bb, int bits)
116 {
117   int result = 0, offset = 0;
118 
119   while (bits) {
120     int click = bb->bitpos >> 3, blow, blen;
121 
122     // Load more data if buffer empty
123     if (click == bb->len) bitbuf_skip(bb, click = 0);
124 
125     // grab bits from next byte
126     blow = bb->bitpos & 7;
127     blen = 8-blow;
128     if (blen > bits) blen = bits;
129     result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
130     offset += blen;
131     bits -= blen;
132     bb->bitpos += blen;
133   }
134 
135   return result;
136 }
137 
bitbuf_flush(struct bitbuf * bb)138 void bitbuf_flush(struct bitbuf *bb)
139 {
140   if (!bb->bitpos) return;
141 
142   xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
143   memset(bb->buf, 0, bb->max);
144   bb->bitpos = 0;
145 }
146 
bitbuf_put(struct bitbuf * bb,int data,int len)147 void bitbuf_put(struct bitbuf *bb, int data, int len)
148 {
149   while (len) {
150     int click = bb->bitpos >> 3, blow, blen;
151 
152     // Flush buffer if necessary
153     if (click == bb->max) {
154       bitbuf_flush(bb);
155       click = 0;
156     }
157     blow = bb->bitpos & 7;
158     blen = 8-blow;
159     if (blen > len) blen = len;
160     bb->buf[click] |= data << blow;
161     bb->bitpos += blen;
162     data >>= blen;
163     len -= blen;
164   }
165 }
166 
output_byte(char sym)167 static void output_byte(char sym)
168 {
169   int pos = TT.pos++ & 32767;
170 
171   TT.data[pos] = sym;
172 
173   if (pos == 32767) {
174     xwrite(TT.outfd, TT.data, 32768);
175     if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
176   }
177 }
178 
179 // Huffman coding uses bits to traverse a binary tree to a leaf node,
180 // By placing frequently occurring symbols at shorter paths, frequently
181 // used symbols may be represented in fewer bits than uncommon symbols.
182 
183 struct huff {
184   unsigned short length[16];
185   unsigned short symbol[288];
186 };
187 
188 // Create simple huffman tree from array of bit lengths.
189 
190 // The symbols in the huffman trees are sorted (first by bit length
191 // of the code to reach them, then by symbol number). This means that given
192 // the bit length of each symbol, we can construct a unique tree.
len2huff(struct huff * huff,char bitlen[],int len)193 static void len2huff(struct huff *huff, char bitlen[], int len)
194 {
195   int offset[16];
196   int i;
197 
198   // Count number of codes at each bit length
199   memset(huff, 0, sizeof(struct huff));
200   for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
201 
202   // Sort symbols by bit length. (They'll remain sorted by symbol within that.)
203   *huff->length = *offset = 0;
204   for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
205 
206   for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
207 }
208 
209 // Fetch and decode next huffman coded symbol from bitbuf.
210 // This takes advantage of the sorting to navigate the tree as an array:
211 // each time we fetch a bit we have all the codes at that bit level in
212 // order with no gaps.
huff_and_puff(struct bitbuf * bb,struct huff * huff)213 static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
214 {
215   unsigned short *length = huff->length;
216   int start = 0, offset = 0;
217 
218   // Traverse through the bit lengths until our code is in this range
219   for (;;) {
220     offset = (offset << 1) | bitbuf_bit(bb);
221     start += *++length;
222     if ((offset -= *length) < 0) break;
223     if ((length - huff->length) & 16) error_exit("bad symbol");
224   }
225 
226   return huff->symbol[start + offset];
227 }
228 
229 // Decompress deflated data from bitbuf to TT.outfd.
inflate(struct bitbuf * bb)230 static void inflate(struct bitbuf *bb)
231 {
232   TT.crc = ~0;
233   // repeat until spanked
234   for (;;) {
235     int final, type;
236 
237     final = bitbuf_get(bb, 1);
238     type = bitbuf_get(bb, 2);
239 
240     if (type == 3) error_exit("bad type");
241 
242     // Uncompressed block?
243     if (!type) {
244       int len, nlen;
245 
246       // Align to byte, read length
247       bitbuf_skip(bb, (8-bb->bitpos)&7);
248       len = bitbuf_get(bb, 16);
249       nlen = bitbuf_get(bb, 16);
250       if (len != (0xffff & ~nlen)) error_exit("bad len");
251 
252       // Dump literal output data
253       while (len) {
254         int pos = bb->bitpos >> 3, bblen = bb->len - pos;
255         char *p = bb->buf+pos;
256 
257         // dump bytes until done or end of current bitbuf contents
258         if (bblen > len) bblen = len;
259         pos = bblen;
260         while (pos--) output_byte(*(p++));
261         bitbuf_skip(bb, bblen << 3);
262         len -= bblen;
263       }
264 
265     // Compressed block
266     } else {
267       struct huff *disthuff, *lithuff;
268 
269       // Dynamic huffman codes?
270       if (type == 2) {
271         struct huff *h2 = ((struct huff *)toybuf)+1;
272         int i, litlen, distlen, hufflen;
273         char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
274                               "\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
275 
276         // The huffman trees are stored as a series of bit lengths
277         litlen = bitbuf_get(bb, 5)+257;  // max 288
278         distlen = bitbuf_get(bb, 5)+1;   // max 32
279         hufflen = bitbuf_get(bb, 4)+4;   // max 19
280 
281         // The literal and distance codes are themselves compressed, in
282         // a complicated way: an array of bit lengths (hufflen many
283         // entries, each 3 bits) is used to fill out an array of 19 entries
284         // in a magic order, leaving the rest 0. Then make a tree out of it:
285         memset(bits = toybuf+1, 0, 19);
286         for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
287         len2huff(h2, bits, 19);
288 
289         // Use that tree to read in the literal and distance bit lengths
290         for (i = 0; i < litlen + distlen;) {
291           int sym = huff_and_puff(bb, h2);
292 
293           // 0-15 are literals, 16 = repeat previous code 3-6 times,
294           // 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
295           if (sym < 16) bits[i++] = sym;
296           else {
297             int len = sym & 2;
298 
299             len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
300             memset(bits+i, bits[i-1] * !(sym&3), len);
301             i += len;
302           }
303         }
304         if (i > litlen+distlen) error_exit("bad tree");
305 
306         len2huff(lithuff = h2, bits, litlen);
307         len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
308 
309       // Static huffman codes
310       } else {
311         lithuff = TT.fixlithuff;
312         disthuff = TT.fixdisthuff;
313       }
314 
315       // Use huffman tables to decode block of compressed symbols
316       for (;;) {
317         int sym = huff_and_puff(bb, lithuff);
318 
319         // Literal?
320         if (sym < 256) output_byte(sym);
321 
322         // Copy range?
323         else if (sym > 256) {
324           int len, dist;
325 
326           sym -= 257;
327           len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
328           sym = huff_and_puff(bb, disthuff);
329           dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
330           sym = TT.pos & 32767;
331 
332           while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
333 
334         // End of block
335         } else break;
336       }
337     }
338 
339     // Was that the last block?
340     if (final) break;
341   }
342 
343   if (TT.pos & 32767) {
344     xwrite(TT.outfd, TT.data, TT.pos & 32767);
345     if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
346   }
347 }
348 
349 // Deflate from TT.infd to bitbuf
350 // For deflate, TT.len = input read, TT.pos = input consumed
deflate(struct bitbuf * bb)351 static void deflate(struct bitbuf *bb)
352 {
353   char *data = TT.data;
354   int len, final = 0;
355 
356   TT.crc = ~0;
357 
358   while (!final) {
359     // Read next half-window of data if we haven't hit EOF yet.
360     len = readall(TT.infd, data+(TT.len&32768), 32768);
361     if (len < 0) perror_exit("read"); // todo: add filename
362     if (len != 32768) final++;
363     if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
364     // TT.len += len;  crcfunc advances len
365 
366     // store block as literal
367     bitbuf_put(bb, final, 1);
368     bitbuf_put(bb, 0, 1);
369 
370     bitbuf_put(bb, 0, (8-bb->bitpos)&7);
371     bitbuf_put(bb, len, 16);
372     bitbuf_put(bb, 0xffff & ~len, 16);
373 
374     // repeat until spanked
375     while (TT.pos != TT.len) {
376       unsigned pos = TT.pos & 65535;
377 
378       bitbuf_put(bb, data[pos], 8);
379 
380       // need to refill buffer?
381       if (!(32767 & ++TT.pos) && !final) break;
382     }
383   }
384   bitbuf_flush(bb);
385 }
386 
387 // Allocate memory for deflate/inflate.
init_deflate(int compress)388 static void init_deflate(int compress)
389 {
390   int i, n = 1;
391 
392   // compress needs 64k data and 32k each for hashhead and hashchain.
393   // decompress just needs 32k data.
394   TT.data = xmalloc(32768*(compress ? 4 : 1));
395   if (compress) {
396     TT.hashhead = (unsigned short *)(TT.data + 65536);
397     TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
398   }
399 
400   // Calculate lenbits, lenbase, distbits, distbase
401   *TT.lenbase = 3;
402   for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
403     if (i>4) {
404       if (!(i&3)) {
405         TT.lenbits[i]++;
406         n <<= 1;
407       }
408       if (i == 27) n--;
409       else TT.lenbits[i+1] = TT.lenbits[i];
410     }
411     TT.lenbase[i+1] = n + TT.lenbase[i];
412   }
413   n = 0;
414   for (i = 0; i<sizeof(TT.distbits); i++) {
415     TT.distbase[i] = 1<<n;
416     if (i) TT.distbase[i] += TT.distbase[i-1];
417     if (i>3 && !(i&1)) n++;
418     TT.distbits[i] = n;
419   }
420 
421   // Init fixed huffman tables
422   for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
423   len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
424   memset(toybuf, 5, 30);
425   len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
426 }
427 
428 // Return true/false whether we consumed a gzip header.
is_gzip(struct bitbuf * bb)429 static int is_gzip(struct bitbuf *bb)
430 {
431   int flags;
432 
433   // Confirm signature
434   if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
435     return 0;
436   bitbuf_skip(bb, 6*8);
437 
438   // Skip extra, name, comment, header CRC fields
439   if (flags & 4) bitbuf_skip(bb, 16);
440   if (flags & 8) while (bitbuf_get(bb, 8));
441   if (flags & 16) while (bitbuf_get(bb, 8));
442   if (flags & 2) bitbuf_skip(bb, 16);
443 
444   return 1;
445 }
446 
gzip_crc(char * data,int len)447 void gzip_crc(char *data, int len)
448 {
449   int i;
450   unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
451 
452   crc = TT.crc;
453   for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
454   TT.crc = crc;
455   TT.len += len;
456 }
457 
do_gzip(int fd,char * name)458 static void do_gzip(int fd, char *name)
459 {
460   struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
461 
462   // Header from RFC 1952 section 2.2:
463   // 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
464   // 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
465   // Operating System (FF=unknown)
466 
467   TT.infd = fd;
468   xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
469 
470   // Use last 1k of toybuf for little endian crc table
471   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
472   TT.crcfunc = gzip_crc;
473 
474   deflate(bb);
475 
476   // tail: crc32, len32
477 
478   bitbuf_put(bb, 0, (8-bb->bitpos)&7);
479   bitbuf_put(bb, ~TT.crc, 32);
480   bitbuf_put(bb, TT.len, 32);
481 
482   bitbuf_flush(bb);
483   free(bb);
484 }
485 
do_zcat(int fd,char * name)486 static void do_zcat(int fd, char *name)
487 {
488   struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
489 
490   if (!is_gzip(bb)) error_exit("not gzip");
491   TT.outfd = 1;
492 
493   // Use last 1k of toybuf for little endian crc table
494   crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
495   TT.crcfunc = gzip_crc;
496 
497   inflate(bb);
498 
499   // tail: crc32, len32
500 
501   bitbuf_skip(bb, (8-bb->bitpos)&7);
502   if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
503     error_exit("bad crc");
504   free(bb);
505 }
506 
507 // Parse many different kinds of command line argument:
508 
compress_main(void)509 void compress_main(void)
510 {
511   // todo: this
512   printf("hello world");
513 }
514