• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************************************
2 * Name:        huffman.c
3 * Author:      Marcus Geelnard
4 * Description: Huffman coder/decoder implementation.
5 * Reentrant:   Yes
6 * $Id: huffman.c,v 1.6 2004/12/14 18:59:40 marcus256 Exp $
7 *
8 * This is a very straight forward implementation of a Huffman coder and
9 * decoder.
10 *
11 * Primary flaws with this primitive implementation are:
12 *  - Slow bit stream implementation
13 *  - Fairly slow decoding (slower than encoding)
14 *  - Maximum tree depth of 32 (the coder aborts if any code exceeds a
15 *    size of 32 bits). If I'm not mistaking, this should not be possible
16 *    unless the input buffer is larger than 2^32 bytes, which is not
17 *    supported by the coder anyway (max 2^32-1 bytes can be specified with
18 *    an unsigned 32-bit integer).
19 *
20 * On the other hand, there are a few advantages of this implementation:
21 *  - The Huffman tree is stored in a very compact form, requiring only
22 *    12 bits per symbol (for 8 bit symbols), meaning a maximum of 384
23 *    bytes overhead.
24 *  - The Huffman coder does quite well in situations where the data is
25 *    noisy, in which case most dictionary based coders run into problems.
26 *
27 * Possible improvements (probably not worth it):
28 *  - Partition the input data stream into blocks, where each block has
29 *    its own Huffman tree. With variable block sizes, it should be
30 *    possible to find locally optimal Huffman trees, which in turn could
31 *    reduce the total size.
32 *  - Allow for a few different predefined Huffman trees, which could
33 *    reduce the size of a block even further.
34 *-------------------------------------------------------------------------
35 * Copyright (c) 2003-2011 Marcus Geelnard
36 *
37 * This software is provided 'as-is', without any express or implied
38 * warranty. In no event will the authors be held liable for any damages
39 * arising from the use of this software.
40 *
41 * Permission is granted to anyone to use this software for any purpose,
42 * including commercial applications, and to alter it and redistribute it
43 * freely, subject to the following restrictions:
44 *
45 * 1. The origin of this software must not be misrepresented; you must not
46 *    claim that you wrote the original software. If you use this software
47 *    in a product, an acknowledgment in the product documentation would
48 *    be appreciated but is not required.
49 *
50 * 2. Altered source versions must be plainly marked as such, and must not
51 *    be misrepresented as being the original software.
52 *
53 * 3. This notice may not be removed or altered from any source
54 *    distribution.
55 *
56 * Marcus Geelnard
57 * marcus.geelnard at home.se
58 *************************************************************************/
59 
60 /* Modified May 06 by Julian Seward for use in Valgrind.
61    - changed integral types to V's versions (UInt, UChar etc)
62    - added initialisation in _Huffman_WriteBits, as described in
63      comment in that function.
64 */
65 
66 /*************************************************************************
67 * Types used for Huffman coding
68 *************************************************************************/
69 
70 typedef struct {
71     UInt Symbol;
72     UInt Count;
73     UInt Code;
74     UInt Bits;
75 } huff_sym_t;
76 
77 typedef struct {
78     UChar *BytePtr;
79     UInt  BitPos;
80 } huff_bitstream_t;
81 
82 
83 
84 /*************************************************************************
85 *                           INTERNAL FUNCTIONS                           *
86 *************************************************************************/
87 
88 
89 /*************************************************************************
90 * _Huffman_InitBitstream() - Initialize a bitstream.
91 *************************************************************************/
92 
_Huffman_InitBitstream(huff_bitstream_t * stream,UChar * buf)93 static void _Huffman_InitBitstream( huff_bitstream_t *stream,
94     UChar *buf )
95 {
96     stream->BytePtr  = buf;
97     stream->BitPos   = 0;
98 }
99 
100 
101 /*************************************************************************
102 * _Huffman_ReadBits() - Read bits from a bitstream.
103 *************************************************************************/
104 
_Huffman_ReadBits(huff_bitstream_t * stream,UInt bits)105 static UInt _Huffman_ReadBits( huff_bitstream_t *stream,
106     UInt bits )
107 {
108     UInt  x, bit, count;
109     UChar *buf;
110 
111     /* Get current stream state */
112     buf = stream->BytePtr;
113     bit = stream->BitPos;
114 
115     /* Extract bits */
116     x = 0;
117     for( count = 0; count < bits; ++ count )
118     {
119         x = (x<<1) + (*buf & (1<<(7-bit)) ? 1 : 0);
120         bit = (bit+1) & 7;
121         if( !bit )
122         {
123             ++ buf;
124         }
125     }
126 
127     /* Store new stream state */
128     stream->BytePtr = buf;
129     stream->BitPos  = bit;
130 
131     return x;
132 }
133 
134 
135 /*************************************************************************
136 * _Huffman_WriteBits() - Write bits to a bitstream.
137 *************************************************************************/
138 
_Huffman_WriteBits(huff_bitstream_t * stream,UInt x,UInt bits)139 static void _Huffman_WriteBits( huff_bitstream_t *stream, UInt x,
140     UInt bits )
141 {
142     UInt  bit, count;
143     UChar *buf;
144     UInt  mask;
145 
146     /* Get current stream state */
147     buf = stream->BytePtr;
148     bit = stream->BitPos;
149 
150     /* Append bits */
151     mask = 1 << (bits-1);
152     for( count = 0; count < bits; ++ count )
153     {
154         /* If we're starting a new byte, zero it out, so that the
155            resulting byte sequence looks completely defined from
156            Valgrind's point of view.  If this doesn't happen then the
157            last byte in the stream may look partially undefined. */
158         if (bit == 0)
159            *buf = 0;
160         *buf = (*buf & (0xff^(1<<(7-bit)))) +
161                ((x & mask ? 1 : 0) << (7-bit));
162         x <<= 1;
163         bit = (bit+1) & 7;
164         if( !bit )
165         {
166             ++ buf;
167         }
168     }
169 
170     /* Store new stream state */
171     stream->BytePtr = buf;
172     stream->BitPos  = bit;
173 }
174 
175 
176 /*************************************************************************
177 * _Huffman_Hist() - Calculate (sorted) histogram for a block of data.
178 *************************************************************************/
179 
_Huffman_Hist(UChar * in,huff_sym_t * sym,UInt size)180 static void _Huffman_Hist( UChar *in, huff_sym_t *sym,
181     UInt size )
182 {
183     Int k, swaps;
184     huff_sym_t tmp;
185 
186     /* Clear/init histogram */
187     for( k = 0; k < 256; ++ k )
188     {
189         sym[k].Symbol = k;
190         sym[k].Count  = 0;
191         sym[k].Code   = 0;
192         sym[k].Bits   = 0;
193     }
194 
195     /* Build histogram */
196     for( k = size; k; -- k )
197     {
198         sym[ *in ++ ].Count ++;
199     }
200 
201     /* Sort histogram - most frequent symbol first (bubble sort) */
202     do
203     {
204         swaps = 0;
205         for( k = 0; k < 255; ++ k )
206         {
207             if( sym[k].Count < sym[k+1].Count )
208             {
209                 tmp      = sym[k];
210                 sym[k]   = sym[k+1];
211                 sym[k+1] = tmp;
212                 swaps    = 1;
213             }
214         }
215     }
216     while( swaps );
217 }
218 
219 
220 /*************************************************************************
221 * _Huffman_MakeTree() - Generate a Huffman tree.
222 *************************************************************************/
223 
_Huffman_MakeTree(huff_sym_t * sym,huff_bitstream_t * stream,UInt code,UInt bits,UInt first,UInt last)224 static void _Huffman_MakeTree( huff_sym_t *sym, huff_bitstream_t *stream,
225     UInt code, UInt bits, UInt first,
226     UInt last )
227 {
228     UInt k, size, size_a, size_b, last_a, first_b;
229 
230     /* Is this a leaf node? */
231     if( first == last )
232     {
233         /* Append symbol to tree description */
234         _Huffman_WriteBits( stream, 1, 1 );
235         _Huffman_WriteBits( stream, sym[first].Symbol, 8 );
236 
237         /* Store code info in symbol array */
238         sym[first].Code = code;
239         sym[first].Bits = bits;
240         return;
241     }
242     else
243     {
244         /* This was not a leaf node */
245         _Huffman_WriteBits( stream, 0, 1 );
246     }
247 
248     /* Total size of interval */
249     size = 0;
250     for( k = first; k <= last; ++ k )
251     {
252         size += sym[k].Count;
253     }
254 
255     /* Find size of branch a */
256     size_a = 0;
257     for( k = first; size_a < ((size+1)>>1) && k < last; ++ k )
258     {
259         size_a += sym[k].Count;
260     }
261 
262     /* Non-empty branch? */
263     if( size_a > 0 )
264     {
265         /* Continue branching */
266         _Huffman_WriteBits( stream, 1, 1 );
267 
268         /* Branch a cut in histogram */
269         last_a  = k-1;
270 
271         /* Create branch a */
272         _Huffman_MakeTree( sym, stream, (code<<1)+0, bits+1,
273                                first, last_a );
274     }
275     else
276     {
277         /* This was an empty branch */
278         _Huffman_WriteBits( stream, 0, 1 );
279     }
280 
281     /* Size of branch b */
282     size_b = size - size_a;
283 
284     /* Non-empty branch? */
285     if( size_b > 0 )
286     {
287         /* Continue branching */
288         _Huffman_WriteBits( stream, 1, 1 );
289 
290         /* Branch b cut in histogram */
291         first_b = k;
292 
293         /* Create branch b */
294         _Huffman_MakeTree( sym, stream, (code<<1)+1, bits+1,
295                                first_b, last );
296     }
297     else
298     {
299         /* This was an empty branch */
300         _Huffman_WriteBits( stream, 0, 1 );
301     }
302 }
303 
304 
305 /*************************************************************************
306 * _Huffman_RecoverTree() - Recover a Huffman tree from a bitstream.
307 *************************************************************************/
308 
_Huffman_RecoverTree(huff_sym_t * sym,huff_bitstream_t * stream,UInt code,UInt bits,UInt * symnum)309 static void _Huffman_RecoverTree( huff_sym_t *sym,
310     huff_bitstream_t *stream, UInt code, UInt bits,
311     UInt *symnum )
312 {
313     UInt symbol;
314 
315     /* Is this a leaf node? */
316     if( _Huffman_ReadBits( stream, 1 ) )
317     {
318         /* Get symbol from tree description */
319         symbol = _Huffman_ReadBits( stream, 8 );
320 
321         /* Store code info in symbol array */
322         sym[*symnum].Symbol = symbol;
323         sym[*symnum].Code   = code;
324         sym[*symnum].Bits   = bits;
325 
326         /* Increase symbol counter */
327         *symnum = *symnum + 1;
328 
329         return;
330     }
331 
332     /* Non-empty branch? */
333     if( _Huffman_ReadBits( stream, 1 ) )
334     {
335         /* Create branch a */
336         _Huffman_RecoverTree( sym, stream, (code<<1)+0, bits+1,
337                               symnum );
338     }
339 
340     /* Non-empty branch? */
341     if( _Huffman_ReadBits( stream, 1 ) )
342     {
343         /* Create branch b */
344         _Huffman_RecoverTree( sym, stream, (code<<1)+1, bits+1,
345                               symnum );
346     }
347 }
348 
349 
350 
351 
352 /*************************************************************************
353 *                            PUBLIC FUNCTIONS                            *
354 *************************************************************************/
355 
356 
357 /*************************************************************************
358 * Huffman_Compress() - Compress a block of data using a Huffman coder.
359 *  in     - Input (uncompressed) buffer.
360 *  out    - Output (compressed) buffer. This buffer must be 384 bytes
361 *           larger than the input buffer.
362 *  insize - Number of input bytes.
363 * The function returns the size of the compressed data.
364 *************************************************************************/
365 static
Huffman_Compress(UChar * in,UChar * out,UInt insize)366 Int Huffman_Compress( UChar *in, UChar *out,
367     UInt insize )
368 {
369     huff_sym_t       sym[ 256 ], tmp;
370     huff_bitstream_t stream;
371     UInt     k, total_bytes, swaps, symbol, last_symbol;
372 
373     /* Do we have anything to compress? */
374     if( insize < 1 ) return 0;
375 
376     /* Initialize bitstream */
377     _Huffman_InitBitstream( &stream, out );
378 
379     /* Calculate and sort histogram for input data */
380     _Huffman_Hist( in, sym, insize );
381 
382     /* Find number of used symbols */
383     for( last_symbol = 255; sym[last_symbol].Count == 0; -- last_symbol );
384 
385     /* Special case: In order to build a correct tree, we need at least
386        two symbols (otherwise we get zero-bit representations). */
387     if( last_symbol == 0 ) ++ last_symbol;
388 
389     /* Build Huffman tree */
390     _Huffman_MakeTree( sym, &stream, 0, 0, 0, last_symbol );
391 
392     /* Was any code > 32 bits? (we do not handle that at present) */
393     for( k = 0; k < 255; ++ k )
394     {
395         if( sym[k].Bits > 32 )
396         {
397             return 0;
398         }
399     }
400 
401     /* Sort histogram - first symbol first (bubble sort) */
402     do
403     {
404         swaps = 0;
405         for( k = 0; k < 255; ++ k )
406         {
407             if( sym[k].Symbol > sym[k+1].Symbol )
408             {
409                 tmp      = sym[k];
410                 sym[k]   = sym[k+1];
411                 sym[k+1] = tmp;
412                 swaps    = 1;
413             }
414         }
415     }
416     while( swaps );
417 
418     /* Encode input stream */
419     for( k = 0; k < insize; ++ k )
420     {
421         symbol = in[ k ];
422         _Huffman_WriteBits( &stream, sym[symbol].Code,
423                             sym[symbol].Bits );
424     }
425 
426     /* Calculate size of output data */
427     total_bytes = (Int)(stream.BytePtr - out);
428     if( stream.BitPos > 0 )
429     {
430         ++ total_bytes;
431     }
432 
433     return total_bytes;
434 }
435 
436 
437 
438 /*************************************************************************
439 * Huffman_Uncompress() - Uncompress a block of data using a Huffman
440 * decoder.
441 *  in      - Input (compressed) buffer.
442 *  out     - Output (uncompressed) buffer. This buffer must be large
443 *            enough to hold the uncompressed data.
444 *  insize  - Number of input bytes.
445 *  outsize - Number of output bytes.
446 *************************************************************************/
447 static
Huffman_Uncompress(UChar * in,UChar * out,UInt insize,UInt outsize)448 void Huffman_Uncompress( UChar *in, UChar *out,
449     UInt insize, UInt outsize )
450 {
451     huff_sym_t       sym[ 256 ], tmp;
452     huff_bitstream_t stream;
453     UInt     k, m, symbol_count, swaps;
454     UChar    *buf;
455     UInt     bits, delta_bits, new_bits, code;
456 
457     /* Do we have anything to decompress? */
458     if( insize < 1 ) return;
459 
460     /* Initialize bitstream */
461     _Huffman_InitBitstream( &stream, in );
462 
463     /* Clear tree/histogram */
464     for( k = 0; k < 256; ++ k )
465     {
466         sym[k].Bits = 0x7fffffff;
467     }
468 
469     /* Recover Huffman tree */
470     symbol_count = 0;
471     _Huffman_RecoverTree( sym, &stream, 0, 0, &symbol_count );
472 
473     /* Sort histogram - shortest code first (bubble sort) */
474     do
475     {
476         swaps = 0;
477         for( k = 0; k < symbol_count-1; ++ k )
478         {
479             if( sym[k].Bits > sym[k+1].Bits )
480             {
481                 tmp      = sym[k];
482                 sym[k]   = sym[k+1];
483                 sym[k+1] = tmp;
484                 swaps    = 1;
485             }
486         }
487     }
488     while( swaps );
489 
490     /* Decode input stream */
491     buf = out;
492     for( k = 0; k < outsize; ++ k )
493     {
494         /* Search tree for matching code */
495         bits = 0;
496         code = 0;
497         for( m = 0; m < symbol_count; ++ m )
498         {
499             delta_bits = sym[m].Bits - bits;
500             if( delta_bits )
501             {
502                 new_bits = _Huffman_ReadBits( &stream, delta_bits );
503                 code = code | (new_bits << (32-bits-delta_bits));
504                 bits = sym[m].Bits;
505             }
506             if( code == (sym[m].Code << (32-sym[m].Bits)) )
507             {
508                 *buf ++ = (UChar) sym[m].Symbol;
509                 break;
510             }
511         }
512     }
513 }
514