1 /*************************************************************************
2 * Name: huffman.c
3 * Author: Marcus Geelnard
4 * Description: Huffman coder/decoder implementation.
5 * Reentrant: Yes
6 * $Id: huffman.c,v 1.6 2004/12/14 18:59:40 marcus256 Exp $
7 *
8 * This is a very straight forward implementation of a Huffman coder and
9 * decoder.
10 *
11 * Primary flaws with this primitive implementation are:
12 * - Slow bit stream implementation
13 * - Fairly slow decoding (slower than encoding)
14 * - Maximum tree depth of 32 (the coder aborts if any code exceeds a
15 * size of 32 bits). If I'm not mistaking, this should not be possible
16 * unless the input buffer is larger than 2^32 bytes, which is not
17 * supported by the coder anyway (max 2^32-1 bytes can be specified with
18 * an unsigned 32-bit integer).
19 *
20 * On the other hand, there are a few advantages of this implementation:
21 * - The Huffman tree is stored in a very compact form, requiring only
22 * 12 bits per symbol (for 8 bit symbols), meaning a maximum of 384
23 * bytes overhead.
24 * - The Huffman coder does quite well in situations where the data is
25 * noisy, in which case most dictionary based coders run into problems.
26 *
27 * Possible improvements (probably not worth it):
28 * - Partition the input data stream into blocks, where each block has
29 * its own Huffman tree. With variable block sizes, it should be
30 * possible to find locally optimal Huffman trees, which in turn could
31 * reduce the total size.
32 * - Allow for a few different predefined Huffman trees, which could
33 * reduce the size of a block even further.
34 *-------------------------------------------------------------------------
35 * Copyright (c) 2003-2011 Marcus Geelnard
36 *
37 * This software is provided 'as-is', without any express or implied
38 * warranty. In no event will the authors be held liable for any damages
39 * arising from the use of this software.
40 *
41 * Permission is granted to anyone to use this software for any purpose,
42 * including commercial applications, and to alter it and redistribute it
43 * freely, subject to the following restrictions:
44 *
45 * 1. The origin of this software must not be misrepresented; you must not
46 * claim that you wrote the original software. If you use this software
47 * in a product, an acknowledgment in the product documentation would
48 * be appreciated but is not required.
49 *
50 * 2. Altered source versions must be plainly marked as such, and must not
51 * be misrepresented as being the original software.
52 *
53 * 3. This notice may not be removed or altered from any source
54 * distribution.
55 *
56 * Marcus Geelnard
57 * marcus.geelnard at home.se
58 *************************************************************************/
59
60 /* Modified May 06 by Julian Seward for use in Valgrind.
61 - changed integral types to V's versions (UInt, UChar etc)
62 - added initialisation in _Huffman_WriteBits, as described in
63 comment in that function.
64 */
65
66 /*************************************************************************
67 * Types used for Huffman coding
68 *************************************************************************/
69
70 typedef struct {
71 UInt Symbol;
72 UInt Count;
73 UInt Code;
74 UInt Bits;
75 } huff_sym_t;
76
77 typedef struct {
78 UChar *BytePtr;
79 UInt BitPos;
80 } huff_bitstream_t;
81
82
83
84 /*************************************************************************
85 * INTERNAL FUNCTIONS *
86 *************************************************************************/
87
88
89 /*************************************************************************
90 * _Huffman_InitBitstream() - Initialize a bitstream.
91 *************************************************************************/
92
_Huffman_InitBitstream(huff_bitstream_t * stream,UChar * buf)93 static void _Huffman_InitBitstream( huff_bitstream_t *stream,
94 UChar *buf )
95 {
96 stream->BytePtr = buf;
97 stream->BitPos = 0;
98 }
99
100
101 /*************************************************************************
102 * _Huffman_ReadBits() - Read bits from a bitstream.
103 *************************************************************************/
104
_Huffman_ReadBits(huff_bitstream_t * stream,UInt bits)105 static UInt _Huffman_ReadBits( huff_bitstream_t *stream,
106 UInt bits )
107 {
108 UInt x, bit, count;
109 UChar *buf;
110
111 /* Get current stream state */
112 buf = stream->BytePtr;
113 bit = stream->BitPos;
114
115 /* Extract bits */
116 x = 0;
117 for( count = 0; count < bits; ++ count )
118 {
119 x = (x<<1) + (*buf & (1<<(7-bit)) ? 1 : 0);
120 bit = (bit+1) & 7;
121 if( !bit )
122 {
123 ++ buf;
124 }
125 }
126
127 /* Store new stream state */
128 stream->BytePtr = buf;
129 stream->BitPos = bit;
130
131 return x;
132 }
133
134
135 /*************************************************************************
136 * _Huffman_WriteBits() - Write bits to a bitstream.
137 *************************************************************************/
138
_Huffman_WriteBits(huff_bitstream_t * stream,UInt x,UInt bits)139 static void _Huffman_WriteBits( huff_bitstream_t *stream, UInt x,
140 UInt bits )
141 {
142 UInt bit, count;
143 UChar *buf;
144 UInt mask;
145
146 /* Get current stream state */
147 buf = stream->BytePtr;
148 bit = stream->BitPos;
149
150 /* Append bits */
151 mask = 1 << (bits-1);
152 for( count = 0; count < bits; ++ count )
153 {
154 /* If we're starting a new byte, zero it out, so that the
155 resulting byte sequence looks completely defined from
156 Valgrind's point of view. If this doesn't happen then the
157 last byte in the stream may look partially undefined. */
158 if (bit == 0)
159 *buf = 0;
160 *buf = (*buf & (0xff^(1<<(7-bit)))) +
161 ((x & mask ? 1 : 0) << (7-bit));
162 x <<= 1;
163 bit = (bit+1) & 7;
164 if( !bit )
165 {
166 ++ buf;
167 }
168 }
169
170 /* Store new stream state */
171 stream->BytePtr = buf;
172 stream->BitPos = bit;
173 }
174
175
176 /*************************************************************************
177 * _Huffman_Hist() - Calculate (sorted) histogram for a block of data.
178 *************************************************************************/
179
_Huffman_Hist(UChar * in,huff_sym_t * sym,UInt size)180 static void _Huffman_Hist( UChar *in, huff_sym_t *sym,
181 UInt size )
182 {
183 Int k, swaps;
184 huff_sym_t tmp;
185
186 /* Clear/init histogram */
187 for( k = 0; k < 256; ++ k )
188 {
189 sym[k].Symbol = k;
190 sym[k].Count = 0;
191 sym[k].Code = 0;
192 sym[k].Bits = 0;
193 }
194
195 /* Build histogram */
196 for( k = size; k; -- k )
197 {
198 sym[ *in ++ ].Count ++;
199 }
200
201 /* Sort histogram - most frequent symbol first (bubble sort) */
202 do
203 {
204 swaps = 0;
205 for( k = 0; k < 255; ++ k )
206 {
207 if( sym[k].Count < sym[k+1].Count )
208 {
209 tmp = sym[k];
210 sym[k] = sym[k+1];
211 sym[k+1] = tmp;
212 swaps = 1;
213 }
214 }
215 }
216 while( swaps );
217 }
218
219
220 /*************************************************************************
221 * _Huffman_MakeTree() - Generate a Huffman tree.
222 *************************************************************************/
223
_Huffman_MakeTree(huff_sym_t * sym,huff_bitstream_t * stream,UInt code,UInt bits,UInt first,UInt last)224 static void _Huffman_MakeTree( huff_sym_t *sym, huff_bitstream_t *stream,
225 UInt code, UInt bits, UInt first,
226 UInt last )
227 {
228 UInt k, size, size_a, size_b, last_a, first_b;
229
230 /* Is this a leaf node? */
231 if( first == last )
232 {
233 /* Append symbol to tree description */
234 _Huffman_WriteBits( stream, 1, 1 );
235 _Huffman_WriteBits( stream, sym[first].Symbol, 8 );
236
237 /* Store code info in symbol array */
238 sym[first].Code = code;
239 sym[first].Bits = bits;
240 return;
241 }
242 else
243 {
244 /* This was not a leaf node */
245 _Huffman_WriteBits( stream, 0, 1 );
246 }
247
248 /* Total size of interval */
249 size = 0;
250 for( k = first; k <= last; ++ k )
251 {
252 size += sym[k].Count;
253 }
254
255 /* Find size of branch a */
256 size_a = 0;
257 for( k = first; size_a < ((size+1)>>1) && k < last; ++ k )
258 {
259 size_a += sym[k].Count;
260 }
261
262 /* Non-empty branch? */
263 if( size_a > 0 )
264 {
265 /* Continue branching */
266 _Huffman_WriteBits( stream, 1, 1 );
267
268 /* Branch a cut in histogram */
269 last_a = k-1;
270
271 /* Create branch a */
272 _Huffman_MakeTree( sym, stream, (code<<1)+0, bits+1,
273 first, last_a );
274 }
275 else
276 {
277 /* This was an empty branch */
278 _Huffman_WriteBits( stream, 0, 1 );
279 }
280
281 /* Size of branch b */
282 size_b = size - size_a;
283
284 /* Non-empty branch? */
285 if( size_b > 0 )
286 {
287 /* Continue branching */
288 _Huffman_WriteBits( stream, 1, 1 );
289
290 /* Branch b cut in histogram */
291 first_b = k;
292
293 /* Create branch b */
294 _Huffman_MakeTree( sym, stream, (code<<1)+1, bits+1,
295 first_b, last );
296 }
297 else
298 {
299 /* This was an empty branch */
300 _Huffman_WriteBits( stream, 0, 1 );
301 }
302 }
303
304
305 /*************************************************************************
306 * _Huffman_RecoverTree() - Recover a Huffman tree from a bitstream.
307 *************************************************************************/
308
_Huffman_RecoverTree(huff_sym_t * sym,huff_bitstream_t * stream,UInt code,UInt bits,UInt * symnum)309 static void _Huffman_RecoverTree( huff_sym_t *sym,
310 huff_bitstream_t *stream, UInt code, UInt bits,
311 UInt *symnum )
312 {
313 UInt symbol;
314
315 /* Is this a leaf node? */
316 if( _Huffman_ReadBits( stream, 1 ) )
317 {
318 /* Get symbol from tree description */
319 symbol = _Huffman_ReadBits( stream, 8 );
320
321 /* Store code info in symbol array */
322 sym[*symnum].Symbol = symbol;
323 sym[*symnum].Code = code;
324 sym[*symnum].Bits = bits;
325
326 /* Increase symbol counter */
327 *symnum = *symnum + 1;
328
329 return;
330 }
331
332 /* Non-empty branch? */
333 if( _Huffman_ReadBits( stream, 1 ) )
334 {
335 /* Create branch a */
336 _Huffman_RecoverTree( sym, stream, (code<<1)+0, bits+1,
337 symnum );
338 }
339
340 /* Non-empty branch? */
341 if( _Huffman_ReadBits( stream, 1 ) )
342 {
343 /* Create branch b */
344 _Huffman_RecoverTree( sym, stream, (code<<1)+1, bits+1,
345 symnum );
346 }
347 }
348
349
350
351
352 /*************************************************************************
353 * PUBLIC FUNCTIONS *
354 *************************************************************************/
355
356
357 /*************************************************************************
358 * Huffman_Compress() - Compress a block of data using a Huffman coder.
359 * in - Input (uncompressed) buffer.
360 * out - Output (compressed) buffer. This buffer must be 384 bytes
361 * larger than the input buffer.
362 * insize - Number of input bytes.
363 * The function returns the size of the compressed data.
364 *************************************************************************/
365 static
Huffman_Compress(UChar * in,UChar * out,UInt insize)366 Int Huffman_Compress( UChar *in, UChar *out,
367 UInt insize )
368 {
369 huff_sym_t sym[ 256 ], tmp;
370 huff_bitstream_t stream;
371 UInt k, total_bytes, swaps, symbol, last_symbol;
372
373 /* Do we have anything to compress? */
374 if( insize < 1 ) return 0;
375
376 /* Initialize bitstream */
377 _Huffman_InitBitstream( &stream, out );
378
379 /* Calculate and sort histogram for input data */
380 _Huffman_Hist( in, sym, insize );
381
382 /* Find number of used symbols */
383 for( last_symbol = 255; sym[last_symbol].Count == 0; -- last_symbol );
384
385 /* Special case: In order to build a correct tree, we need at least
386 two symbols (otherwise we get zero-bit representations). */
387 if( last_symbol == 0 ) ++ last_symbol;
388
389 /* Build Huffman tree */
390 _Huffman_MakeTree( sym, &stream, 0, 0, 0, last_symbol );
391
392 /* Was any code > 32 bits? (we do not handle that at present) */
393 for( k = 0; k < 255; ++ k )
394 {
395 if( sym[k].Bits > 32 )
396 {
397 return 0;
398 }
399 }
400
401 /* Sort histogram - first symbol first (bubble sort) */
402 do
403 {
404 swaps = 0;
405 for( k = 0; k < 255; ++ k )
406 {
407 if( sym[k].Symbol > sym[k+1].Symbol )
408 {
409 tmp = sym[k];
410 sym[k] = sym[k+1];
411 sym[k+1] = tmp;
412 swaps = 1;
413 }
414 }
415 }
416 while( swaps );
417
418 /* Encode input stream */
419 for( k = 0; k < insize; ++ k )
420 {
421 symbol = in[ k ];
422 _Huffman_WriteBits( &stream, sym[symbol].Code,
423 sym[symbol].Bits );
424 }
425
426 /* Calculate size of output data */
427 total_bytes = (Int)(stream.BytePtr - out);
428 if( stream.BitPos > 0 )
429 {
430 ++ total_bytes;
431 }
432
433 return total_bytes;
434 }
435
436
437
438 /*************************************************************************
439 * Huffman_Uncompress() - Uncompress a block of data using a Huffman
440 * decoder.
441 * in - Input (compressed) buffer.
442 * out - Output (uncompressed) buffer. This buffer must be large
443 * enough to hold the uncompressed data.
444 * insize - Number of input bytes.
445 * outsize - Number of output bytes.
446 *************************************************************************/
447 static
Huffman_Uncompress(UChar * in,UChar * out,UInt insize,UInt outsize)448 void Huffman_Uncompress( UChar *in, UChar *out,
449 UInt insize, UInt outsize )
450 {
451 huff_sym_t sym[ 256 ], tmp;
452 huff_bitstream_t stream;
453 UInt k, m, symbol_count, swaps;
454 UChar *buf;
455 UInt bits, delta_bits, new_bits, code;
456
457 /* Do we have anything to decompress? */
458 if( insize < 1 ) return;
459
460 /* Initialize bitstream */
461 _Huffman_InitBitstream( &stream, in );
462
463 /* Clear tree/histogram */
464 for( k = 0; k < 256; ++ k )
465 {
466 sym[k].Bits = 0x7fffffff;
467 }
468
469 /* Recover Huffman tree */
470 symbol_count = 0;
471 _Huffman_RecoverTree( sym, &stream, 0, 0, &symbol_count );
472
473 /* Sort histogram - shortest code first (bubble sort) */
474 do
475 {
476 swaps = 0;
477 for( k = 0; k < symbol_count-1; ++ k )
478 {
479 if( sym[k].Bits > sym[k+1].Bits )
480 {
481 tmp = sym[k];
482 sym[k] = sym[k+1];
483 sym[k+1] = tmp;
484 swaps = 1;
485 }
486 }
487 }
488 while( swaps );
489
490 /* Decode input stream */
491 buf = out;
492 for( k = 0; k < outsize; ++ k )
493 {
494 /* Search tree for matching code */
495 bits = 0;
496 code = 0;
497 for( m = 0; m < symbol_count; ++ m )
498 {
499 delta_bits = sym[m].Bits - bits;
500 if( delta_bits )
501 {
502 new_bits = _Huffman_ReadBits( &stream, delta_bits );
503 code = code | (new_bits << (32-bits-delta_bits));
504 bits = sym[m].Bits;
505 }
506 if( code == (sym[m].Code << (32-sym[m].Bits)) )
507 {
508 *buf ++ = (UChar) sym[m].Symbol;
509 break;
510 }
511 }
512 }
513 }
514