1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
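
# Typical usage, a minimal sketch (the file name below is hypothetical):
#
#   import gzip
#   with gzip.open('example.txt.gz', 'wb') as f:
#       f.write('some data\n')
#   with gzip.open('example.txt.gz', 'rb') as f:
#       data = f.read()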

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct, sys, time, os
import zlib
import io
import __builtin__

__all__ = ["GzipFile","open"]

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
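# Gzip header flag bits, per RFC 1952: text hint, header CRC, extra field,
# original file name, and comment, respectively.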

READ, WRITE = 1, 2

def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
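    # For example, struct.pack("<L", 0xFFFFFFFFL) yields '\xff\xff\xff\xff',
    # the same bit pattern a signed value of -1 would have.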
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<I", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

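        A sketch of wrapping an in-memory buffer (the variable names here
        are illustrative, not part of the API):

            import StringIO
            buf = StringIO.StringIO()
            gz = GzipFile(filename='data.txt', mode='wb', fileobj=buf)
            gz.write('payload')
            gz.close()
            compressed = buf.getvalue()
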
        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
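            # (A gzip stream may contain several members concatenated back
            # to back, e.g. the output of "cat a.gz b.gz"; each is read in
            # turn.)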
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
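        # The fixed part of the header written below is, per RFC 1952:
        #   bytes 0-1  magic number '\037\213'
        #   byte  2    compression method (8 = deflate)
        #   byte  3    flags (FNAME set when a file name is stored)
        #   bytes 4-7  modification time, 32-bit little-endian
        #   byte  8    extra flags ('\002' = slowest/maximum compression)
        #   byte  9    operating system ('\377' = unknown)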
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0

    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError, "Reached EOF"
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffffL):
            raise IOError, "Incorrect length of data produced"

        # Gzip files can be padded with zeroes and still be valid archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        if self.fileobj is None:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32u(self.fileobj, self.crc)
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, self.size & 0xffffffffL)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
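        # Z_SYNC_FLUSH forces all pending compressed output onto a byte
        # boundary, so everything written so far can be decompressed by a
        # reader of the underlying file.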
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
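        # Seeking is emulated: in write mode a forward seek writes zero
        # padding, and in read mode a backward seek rewinds to the start and
        # re-reads, so seeks far from the current offset can be expensive.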
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i = size - 1 (so that c[:i + 1] is exactly 'size' bytes)
            # to break out of the loop under two conditions: 1) there's no
            # newline and the chunk is larger than size, or 2) there is a
            # newline, but the resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
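    #
    # Invocation sketch (assuming this module is saved as gzip.py; the file
    # names are illustrative):
    #   python gzip.py example.txt        # compresses to example.txt.gz
    #   python gzip.py -d example.txt.gz  # decompresses to example.txt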
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", repr(arg)
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()