• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* zran.c -- example of zlib/gzip stream indexing and random access
2  * Copyright (C) 2005, 2012, 2018 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  * Version 1.2  14 Oct 2018  Mark Adler */
5 
6 /* Version History:
7  1.0  29 May 2005  First version
8  1.1  29 Sep 2012  Fix memory reallocation error
9  1.2  14 Oct 2018  Handle gzip streams with multiple members
10                    Add a header file to facilitate usage in applications
11  */
12 
13 /* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
14    for random access of a compressed file.  A file containing a zlib or gzip
15    stream is provided on the command line.  The compressed stream is decoded in
16    its entirety, and an index built with access points about every SPAN bytes
17    in the uncompressed output.  The compressed file is left open, and can then
18    be read randomly, having to decompress on the average SPAN/2 uncompressed
19    bytes before getting to the desired block of data.
20 
21    An access point can be created at the start of any deflate block, by saving
22    the starting file offset and bit of that block, and the 32K bytes of
23    uncompressed data that precede that block.  Also the uncompressed offset of
24    that block is saved to provide a referece for locating a desired starting
25    point in the uncompressed stream.  deflate_index_build() works by
26    decompressing the input zlib or gzip stream a block at a time, and at the
27    end of each block deciding if enough uncompressed data has gone by to
28    justify the creation of a new access point.  If so, that point is saved in a
29    data structure that grows as needed to accommodate the points.
30 
31    To use the index, an offset in the uncompressed data is provided, for which
32    the latest access point at or preceding that offset is located in the index.
33    The input file is positioned to the specified location in the index, and if
34    necessary the first few bits of the compressed data is read from the file.
35    inflate is initialized with those bits and the 32K of uncompressed data, and
36    the decompression then proceeds until the desired offset in the file is
37    reached.  Then the decompression continues to read the desired uncompressed
38    data from the file.
39 
40    Another approach would be to generate the index on demand.  In that case,
41    requests for random access reads from the compressed data would try to use
42    the index, but if a read far enough past the end of the index is required,
43    then further index entries would be generated and added.
44 
45    There is some fair bit of overhead to starting inflation for the random
46    access, mainly copying the 32K byte dictionary.  So if small pieces of the
47    file are being accessed, it would make sense to implement a cache to hold
48    some lookahead and avoid many calls to deflate_index_extract() for small
49    lengths.
50 
51    Another way to build an index would be to use inflateCopy().  That would
52    not be constrained to have access points at block boundaries, but requires
53    more memory per access point, and also cannot be saved to file due to the
54    use of pointers in the state.  The approach here allows for storage of the
55    index in a file.
56  */
57 
58 #include <stdio.h>
59 #include <stdlib.h>
60 #include <string.h>
61 #include "zlib.h"
62 #include "zran.h"
63 
64 #define WINSIZE 32768U      /* sliding window size */
65 #define CHUNK 16384         /* file input buffer size */
66 
67 /* Access point entry. */
68 struct point {
69     off_t out;          /* corresponding offset in uncompressed data */
70     off_t in;           /* offset in input file of first full byte */
71     int bits;           /* number of bits (1-7) from byte at in-1, or 0 */
72     unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
73 };
74 
75 /* See comments in zran.h. */
deflate_index_free(struct deflate_index * index)76 void deflate_index_free(struct deflate_index *index)
77 {
78     if (index != NULL) {
79         free(index->list);
80         free(index);
81     }
82 }
83 
84 /* Add an entry to the access point list. If out of memory, deallocate the
85    existing list and return NULL. index->gzip is the allocated size of the
86    index in point entries, until it is time for deflate_index_build() to
87    return, at which point gzip is set to indicate a gzip file or not.
88  */
addpoint(struct deflate_index * index,int bits,off_t in,off_t out,unsigned left,unsigned char * window)89 static struct deflate_index *addpoint(struct deflate_index *index, int bits,
90                                       off_t in, off_t out, unsigned left,
91                                       unsigned char *window)
92 {
93     struct point *next;
94 
95     /* if list is empty, create it (start with eight points) */
96     if (index == NULL) {
97         index = malloc(sizeof(struct deflate_index));
98         if (index == NULL) return NULL;
99         index->list = malloc(sizeof(struct point) << 3);
100         if (index->list == NULL) {
101             free(index);
102             return NULL;
103         }
104         index->gzip = 8;
105         index->have = 0;
106     }
107 
108     /* if list is full, make it bigger */
109     else if (index->have == index->gzip) {
110         index->gzip <<= 1;
111         next = realloc(index->list, sizeof(struct point) * index->gzip);
112         if (next == NULL) {
113             deflate_index_free(index);
114             return NULL;
115         }
116         index->list = next;
117     }
118 
119     /* fill in entry and increment how many we have */
120     next = (struct point *)(index->list) + index->have;
121     next->bits = bits;
122     next->in = in;
123     next->out = out;
124     if (left)
125         memcpy(next->window, window + WINSIZE - left, left);
126     if (left < WINSIZE)
127         memcpy(next->window + left, window, WINSIZE - left);
128     index->have++;
129 
130     /* return list, possibly reallocated */
131     return index;
132 }
133 
134 /* See comments in zran.h. */
deflate_index_build(FILE * in,off_t span,struct deflate_index ** built)135 int deflate_index_build(FILE *in, off_t span, struct deflate_index **built)
136 {
137     int ret;
138     int gzip = 0;               /* true if reading a gzip file */
139     off_t totin, totout;        /* our own total counters to avoid 4GB limit */
140     off_t last;                 /* totout value of last access point */
141     struct deflate_index *index;    /* access points being generated */
142     z_stream strm;
143     unsigned char input[CHUNK];
144     unsigned char window[WINSIZE];
145 
146     /* initialize inflate */
147     strm.zalloc = Z_NULL;
148     strm.zfree = Z_NULL;
149     strm.opaque = Z_NULL;
150     strm.avail_in = 0;
151     strm.next_in = Z_NULL;
152     ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
153     if (ret != Z_OK)
154         return ret;
155 
156     /* inflate the input, maintain a sliding window, and build an index -- this
157        also validates the integrity of the compressed data using the check
158        information in the gzip or zlib stream */
159     totin = totout = last = 0;
160     index = NULL;               /* will be allocated by first addpoint() */
161     strm.avail_out = 0;
162     do {
163         /* get some compressed data from input file */
164         strm.avail_in = fread(input, 1, CHUNK, in);
165         if (ferror(in)) {
166             ret = Z_ERRNO;
167             goto deflate_index_build_error;
168         }
169         if (strm.avail_in == 0) {
170             ret = Z_DATA_ERROR;
171             goto deflate_index_build_error;
172         }
173         strm.next_in = input;
174 
175         /* check for a gzip stream */
176         if (totin == 0 && strm.avail_in >= 3 &&
177             input[0] == 31 && input[1] == 139 && input[2] == 8)
178             gzip = 1;
179 
180         /* process all of that, or until end of stream */
181         do {
182             /* reset sliding window if necessary */
183             if (strm.avail_out == 0) {
184                 strm.avail_out = WINSIZE;
185                 strm.next_out = window;
186             }
187 
188             /* inflate until out of input, output, or at end of block --
189                update the total input and output counters */
190             totin += strm.avail_in;
191             totout += strm.avail_out;
192             ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
193             totin -= strm.avail_in;
194             totout -= strm.avail_out;
195             if (ret == Z_NEED_DICT)
196                 ret = Z_DATA_ERROR;
197             if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
198                 goto deflate_index_build_error;
199             if (ret == Z_STREAM_END) {
200                 if (gzip &&
201                     (strm.avail_in || ungetc(getc(in), in) != EOF)) {
202                     ret = inflateReset(&strm);
203                     if (ret != Z_OK)
204                         goto deflate_index_build_error;
205                     continue;
206                 }
207                 break;
208             }
209 
210             /* if at end of block, consider adding an index entry (note that if
211                data_type indicates an end-of-block, then all of the
212                uncompressed data from that block has been delivered, and none
213                of the compressed data after that block has been consumed,
214                except for up to seven bits) -- the totout == 0 provides an
215                entry point after the zlib or gzip header, and assures that the
216                index always has at least one access point; we avoid creating an
217                access point after the last block by checking bit 6 of data_type
218              */
219             if ((strm.data_type & 128) && !(strm.data_type & 64) &&
220                 (totout == 0 || totout - last > span)) {
221                 index = addpoint(index, strm.data_type & 7, totin,
222                                  totout, strm.avail_out, window);
223                 if (index == NULL) {
224                     ret = Z_MEM_ERROR;
225                     goto deflate_index_build_error;
226                 }
227                 last = totout;
228             }
229         } while (strm.avail_in != 0);
230     } while (ret != Z_STREAM_END);
231 
232     /* clean up and return index (release unused entries in list) */
233     (void)inflateEnd(&strm);
234     index->list = realloc(index->list, sizeof(struct point) * index->have);
235     index->gzip = gzip;
236     index->length = totout;
237     *built = index;
238     return index->have;
239 
240     /* return error */
241   deflate_index_build_error:
242     (void)inflateEnd(&strm);
243     deflate_index_free(index);
244     return ret;
245 }
246 
247 /* See comments in zran.h. */
deflate_index_extract(FILE * in,struct deflate_index * index,off_t offset,unsigned char * buf,int len)248 int deflate_index_extract(FILE *in, struct deflate_index *index, off_t offset,
249                           unsigned char *buf, int len)
250 {
251     int ret, skip;
252     z_stream strm;
253     struct point *here;
254     unsigned char input[CHUNK];
255     unsigned char discard[WINSIZE];
256 
257     /* proceed only if something reasonable to do */
258     if (len < 0)
259         return 0;
260 
261     /* find where in stream to start */
262     here = index->list;
263     ret = index->have;
264     while (--ret && here[1].out <= offset)
265         here++;
266 
267     /* initialize file and inflate state to start there */
268     strm.zalloc = Z_NULL;
269     strm.zfree = Z_NULL;
270     strm.opaque = Z_NULL;
271     strm.avail_in = 0;
272     strm.next_in = Z_NULL;
273     ret = inflateInit2(&strm, -15);         /* raw inflate */
274     if (ret != Z_OK)
275         return ret;
276     ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
277     if (ret == -1)
278         goto deflate_index_extract_ret;
279     if (here->bits) {
280         ret = getc(in);
281         if (ret == -1) {
282             ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
283             goto deflate_index_extract_ret;
284         }
285         (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
286     }
287     (void)inflateSetDictionary(&strm, here->window, WINSIZE);
288 
289     /* skip uncompressed bytes until offset reached, then satisfy request */
290     offset -= here->out;
291     strm.avail_in = 0;
292     skip = 1;                               /* while skipping to offset */
293     do {
294         /* define where to put uncompressed data, and how much */
295         if (offset > WINSIZE) {             /* skip WINSIZE bytes */
296             strm.avail_out = WINSIZE;
297             strm.next_out = discard;
298             offset -= WINSIZE;
299         }
300         else if (offset > 0) {              /* last skip */
301             strm.avail_out = (unsigned)offset;
302             strm.next_out = discard;
303             offset = 0;
304         }
305         else if (skip) {                    /* at offset now */
306             strm.avail_out = len;
307             strm.next_out = buf;
308             skip = 0;                       /* only do this once */
309         }
310 
311         /* uncompress until avail_out filled, or end of stream */
312         do {
313             if (strm.avail_in == 0) {
314                 strm.avail_in = fread(input, 1, CHUNK, in);
315                 if (ferror(in)) {
316                     ret = Z_ERRNO;
317                     goto deflate_index_extract_ret;
318                 }
319                 if (strm.avail_in == 0) {
320                     ret = Z_DATA_ERROR;
321                     goto deflate_index_extract_ret;
322                 }
323                 strm.next_in = input;
324             }
325             ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
326             if (ret == Z_NEED_DICT)
327                 ret = Z_DATA_ERROR;
328             if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
329                 goto deflate_index_extract_ret;
330             if (ret == Z_STREAM_END) {
331                 /* the raw deflate stream has ended */
332                 if (index->gzip == 0)
333                     /* this is a zlib stream that has ended -- done */
334                     break;
335 
336                 /* near the end of a gzip member, which might be followed by
337                    another gzip member -- skip the gzip trailer and see if
338                    there is more input after it */
339                 if (strm.avail_in < 8) {
340                     fseeko(in, 8 - strm.avail_in, SEEK_CUR);
341                     strm.avail_in = 0;
342                 }
343                 else {
344                     strm.avail_in -= 8;
345                     strm.next_in += 8;
346                 }
347                 if (strm.avail_in == 0 && ungetc(getc(in), in) == EOF)
348                     /* the input ended after the gzip trailer -- done */
349                     break;
350 
351                 /* there is more input, so another gzip member should follow --
352                    validate and skip the gzip header */
353                 ret = inflateReset2(&strm, 31);
354                 if (ret != Z_OK)
355                     goto deflate_index_extract_ret;
356                 do {
357                     if (strm.avail_in == 0) {
358                         strm.avail_in = fread(input, 1, CHUNK, in);
359                         if (ferror(in)) {
360                             ret = Z_ERRNO;
361                             goto deflate_index_extract_ret;
362                         }
363                         if (strm.avail_in == 0) {
364                             ret = Z_DATA_ERROR;
365                             goto deflate_index_extract_ret;
366                         }
367                         strm.next_in = input;
368                     }
369                     ret = inflate(&strm, Z_BLOCK);
370                     if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
371                         goto deflate_index_extract_ret;
372                 } while ((strm.data_type & 128) == 0);
373 
374                 /* set up to continue decompression of the raw deflate stream
375                    that follows the gzip header */
376                 ret = inflateReset2(&strm, -15);
377                 if (ret != Z_OK)
378                     goto deflate_index_extract_ret;
379             }
380 
381             /* continue to process the available input before reading more */
382         } while (strm.avail_out != 0);
383 
384         if (ret == Z_STREAM_END)
385             /* reached the end of the compressed data -- return the data that
386                was available, possibly less than requested */
387             break;
388 
389         /* do until offset reached and requested data read */
390     } while (skip);
391 
392     /* compute the number of uncompressed bytes read after the offset */
393     ret = skip ? 0 : len - strm.avail_out;
394 
395     /* clean up and return the bytes read, or the negative error */
396   deflate_index_extract_ret:
397     (void)inflateEnd(&strm);
398     return ret;
399 }
400 
401 #ifdef TEST
402 
403 #define SPAN 1048576L       /* desired distance between access points */
404 #define LEN 16384           /* number of bytes to extract */
405 
406 /* Demonstrate the use of deflate_index_build() and deflate_index_extract() by
407    processing the file provided on the command line, and extracting LEN bytes
408    from 2/3rds of the way through the uncompressed output, writing that to
409    stdout. An offset can be provided as the second argument, in which case the
410    data is extracted from there instead. */
main(int argc,char ** argv)411 int main(int argc, char **argv)
412 {
413     int len;
414     off_t offset = -1;
415     FILE *in;
416     struct deflate_index *index = NULL;
417     unsigned char buf[LEN];
418 
419     /* open input file */
420     if (argc < 2 || argc > 3) {
421         fprintf(stderr, "usage: zran file.gz [offset]\n");
422         return 1;
423     }
424     in = fopen(argv[1], "rb");
425     if (in == NULL) {
426         fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
427         return 1;
428     }
429 
430     /* get optional offset */
431     if (argc == 3) {
432         char *end;
433         offset = strtoll(argv[2], &end, 10);
434         if (*end || offset < 0) {
435             fprintf(stderr, "zran: %s is not a valid offset\n", argv[2]);
436             return 1;
437         }
438     }
439 
440     /* build index */
441     len = deflate_index_build(in, SPAN, &index);
442     if (len < 0) {
443         fclose(in);
444         switch (len) {
445         case Z_MEM_ERROR:
446             fprintf(stderr, "zran: out of memory\n");
447             break;
448         case Z_DATA_ERROR:
449             fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
450             break;
451         case Z_ERRNO:
452             fprintf(stderr, "zran: read error on %s\n", argv[1]);
453             break;
454         default:
455             fprintf(stderr, "zran: error %d while building index\n", len);
456         }
457         return 1;
458     }
459     fprintf(stderr, "zran: built index with %d access points\n", len);
460 
461     /* use index by reading some bytes from an arbitrary offset */
462     if (offset == -1)
463         offset = (index->length << 1) / 3;
464     len = deflate_index_extract(in, index, offset, buf, LEN);
465     if (len < 0)
466         fprintf(stderr, "zran: extraction failed: %s error\n",
467                 len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
468     else {
469         fwrite(buf, 1, len, stdout);
470         fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
471     }
472 
473     /* clean up and exit */
474     deflate_index_free(index);
475     fclose(in);
476     return 0;
477 }
478 
479 #endif
480