• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import io
11import __builtin__
12
13__all__ = ["GzipFile","open"]
14
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the file was opened.
READ = 1
WRITE = 2
18
def write32u(output, value):
    """Write value to output as four little-endian bytes.

    value is packed with the struct format "<L" and the resulting string
    is handed to output.write() in a single call.
    """
    packed = struct.pack("<L", value)
    output.write(packed)
23
def read32(input):
    """Read four bytes from input and return them as an unsigned integer.

    The bytes are decoded with the struct format "<I" (little-endian).
    struct.error propagates if input.read(4) returns fewer than four bytes.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
26
def open(filename, mode="rb", compresslevel=9):
    """Create and return a GzipFile for filename.

    This is a convenience wrapper: the three arguments are forwarded
    unchanged to the GzipFile constructor.  filename must be supplied;
    mode falls back to 'rb' and compresslevel to 9 when omitted.

    """
    gzfile = GzipFile(filename=filename, mode=mode,
                      compresslevel=compresslevel)
    return gzfile
35
36class GzipFile(io.BufferedIOBase):
37    """The GzipFile class simulates most of the methods of a file object with
38    the exception of the readinto() and truncate() methods.
39
40    """
41
42    myfileobj = None
43    max_read_chunk = 10 * 1024 * 1024   # 10Mb
44
45    def __init__(self, filename=None, mode=None,
46                 compresslevel=9, fileobj=None, mtime=None):
47        """Constructor for the GzipFile class.
48
49        At least one of fileobj and filename must be given a
50        non-trivial value.
51
52        The new class instance is based on fileobj, which can be a regular
53        file, a StringIO object, or any other object which simulates a file.
54        It defaults to None, in which case filename is opened to provide
55        a file object.
56
57        When fileobj is not None, the filename argument is only used to be
58        included in the gzip file header, which may include the original
59        filename of the uncompressed file.  It defaults to the filename of
60        fileobj, if discernible; otherwise, it defaults to the empty string,
61        and in this case the original filename is not included in the header.
62
63        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
64        depending on whether the file will be read or written.  The default
65        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
66        Be aware that only the 'rb', 'ab', and 'wb' values should be used
67        for cross-platform portability.
68
69        The compresslevel argument is an integer from 0 to 9 controlling the
70        level of compression; 1 is fastest and produces the least compression,
71        and 9 is slowest and produces the most compression. 0 is no compression
72        at all. The default is 9.
73
74        The mtime argument is an optional numeric timestamp to be written
75        to the stream when compressing.  All gzip compressed streams
76        are required to contain a timestamp.  If omitted or None, the
77        current time is used.  This module ignores the timestamp when
78        decompressing; however, some programs, such as gunzip, make use
79        of it.  The format of the timestamp is the same as that of the
80        return value of time.time() and of the st_mtime member of the
81        object returned by os.stat().
82
83        """
84
85        # Make sure we don't inadvertently enable universal newlines on the
86        # underlying file object - in read mode, this causes data corruption.
87        if mode:
88            mode = mode.replace('U', '')
89        # guarantee the file is opened in binary mode on platforms
90        # that care about that sort of thing
91        if mode and 'b' not in mode:
92            mode += 'b'
93        if fileobj is None:
94            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
95        if filename is None:
96            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
97            # attribute. Avoid saving this in the gzip header's filename field.
98            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
99                filename = fileobj.name
100            else:
101                filename = ''
102        if mode is None:
103            if hasattr(fileobj, 'mode'): mode = fileobj.mode
104            else: mode = 'rb'
105
106        if mode[0:1] == 'r':
107            self.mode = READ
108            # Set flag indicating start of a new member
109            self._new_member = True
110            # Buffer data read from gzip file. extrastart is offset in
111            # stream where buffer starts. extrasize is number of
112            # bytes remaining in buffer from current stream position.
113            self.extrabuf = ""
114            self.extrasize = 0
115            self.extrastart = 0
116            self.name = filename
117            # Starts small, scales exponentially
118            self.min_readsize = 100
119
120        elif mode[0:1] == 'w' or mode[0:1] == 'a':
121            self.mode = WRITE
122            self._init_write(filename)
123            self.compress = zlib.compressobj(compresslevel,
124                                             zlib.DEFLATED,
125                                             -zlib.MAX_WBITS,
126                                             zlib.DEF_MEM_LEVEL,
127                                             0)
128        else:
129            raise IOError, "Mode " + mode + " not supported"
130
131        self.fileobj = fileobj
132        self.offset = 0
133        self.mtime = mtime
134
135        if self.mode == WRITE:
136            self._write_gzip_header()
137
138    @property
139    def filename(self):
140        import warnings
141        warnings.warn("use the name attribute", DeprecationWarning, 2)
142        if self.mode == WRITE and self.name[-3:] != ".gz":
143            return self.name + ".gz"
144        return self.name
145
146    def __repr__(self):
147        s = repr(self.fileobj)
148        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
149
150    def _check_closed(self):
151        """Raises a ValueError if the underlying file object has been closed.
152
153        """
154        if self.closed:
155            raise ValueError('I/O operation on closed file.')
156
157    def _init_write(self, filename):
158        self.name = filename
159        self.crc = zlib.crc32("") & 0xffffffffL
160        self.size = 0
161        self.writebuf = []
162        self.bufsize = 0
163
164    def _write_gzip_header(self):
165        self.fileobj.write('\037\213')             # magic header
166        self.fileobj.write('\010')                 # compression method
167        try:
168            # RFC 1952 requires the FNAME field to be Latin-1. Do not
169            # include filenames that cannot be represented that way.
170            fname = os.path.basename(self.name)
171            if not isinstance(fname, str):
172                fname = fname.encode('latin-1')
173            if fname.endswith('.gz'):
174                fname = fname[:-3]
175        except UnicodeEncodeError:
176            fname = ''
177        flags = 0
178        if fname:
179            flags = FNAME
180        self.fileobj.write(chr(flags))
181        mtime = self.mtime
182        if mtime is None:
183            mtime = time.time()
184        write32u(self.fileobj, long(mtime))
185        self.fileobj.write('\002')
186        self.fileobj.write('\377')
187        if fname:
188            self.fileobj.write(fname + '\000')
189
190    def _init_read(self):
191        self.crc = zlib.crc32("") & 0xffffffffL
192        self.size = 0
193
194    def _read_gzip_header(self):
195        magic = self.fileobj.read(2)
196        if magic != '\037\213':
197            raise IOError, 'Not a gzipped file'
198        method = ord( self.fileobj.read(1) )
199        if method != 8:
200            raise IOError, 'Unknown compression method'
201        flag = ord( self.fileobj.read(1) )
202        self.mtime = read32(self.fileobj)
203        # extraflag = self.fileobj.read(1)
204        # os = self.fileobj.read(1)
205        self.fileobj.read(2)
206
207        if flag & FEXTRA:
208            # Read & discard the extra field, if present
209            xlen = ord(self.fileobj.read(1))
210            xlen = xlen + 256*ord(self.fileobj.read(1))
211            self.fileobj.read(xlen)
212        if flag & FNAME:
213            # Read and discard a null-terminated string containing the filename
214            while True:
215                s = self.fileobj.read(1)
216                if not s or s=='\000':
217                    break
218        if flag & FCOMMENT:
219            # Read and discard a null-terminated string containing a comment
220            while True:
221                s = self.fileobj.read(1)
222                if not s or s=='\000':
223                    break
224        if flag & FHCRC:
225            self.fileobj.read(2)     # Read & discard the 16-bit header CRC
226
227    def write(self,data):
228        self._check_closed()
229        if self.mode != WRITE:
230            import errno
231            raise IOError(errno.EBADF, "write() on read-only GzipFile object")
232
233        if self.fileobj is None:
234            raise ValueError, "write() on closed GzipFile object"
235
236        # Convert data type if called by io.BufferedWriter.
237        if isinstance(data, memoryview):
238            data = data.tobytes()
239
240        if len(data) > 0:
241            self.fileobj.write(self.compress.compress(data))
242            self.size += len(data)
243            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
244            self.offset += len(data)
245
246        return len(data)
247
248    def read(self, size=-1):
249        self._check_closed()
250        if self.mode != READ:
251            import errno
252            raise IOError(errno.EBADF, "read() on write-only GzipFile object")
253
254        if self.extrasize <= 0 and self.fileobj is None:
255            return ''
256
257        readsize = 1024
258        if size < 0:        # get the whole thing
259            try:
260                while True:
261                    self._read(readsize)
262                    readsize = min(self.max_read_chunk, readsize * 2)
263            except EOFError:
264                size = self.extrasize
265        else:               # just get some more of it
266            try:
267                while size > self.extrasize:
268                    self._read(readsize)
269                    readsize = min(self.max_read_chunk, readsize * 2)
270            except EOFError:
271                if size > self.extrasize:
272                    size = self.extrasize
273
274        offset = self.offset - self.extrastart
275        chunk = self.extrabuf[offset: offset + size]
276        self.extrasize = self.extrasize - size
277
278        self.offset += size
279        return chunk
280
281    def _unread(self, buf):
282        self.extrasize = len(buf) + self.extrasize
283        self.offset -= len(buf)
284
285    def _read(self, size=1024):
286        if self.fileobj is None:
287            raise EOFError, "Reached EOF"
288
289        if self._new_member:
290            # If the _new_member flag is set, we have to
291            # jump to the next member, if there is one.
292            #
293            # First, check if we're at the end of the file;
294            # if so, it's time to stop; no more members to read.
295            pos = self.fileobj.tell()   # Save current position
296            self.fileobj.seek(0, 2)     # Seek to end of file
297            if pos == self.fileobj.tell():
298                raise EOFError, "Reached EOF"
299            else:
300                self.fileobj.seek( pos ) # Return to original position
301
302            self._init_read()
303            self._read_gzip_header()
304            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
305            self._new_member = False
306
307        # Read a chunk of data from the file
308        buf = self.fileobj.read(size)
309
310        # If the EOF has been reached, flush the decompression object
311        # and mark this object as finished.
312
313        if buf == "":
314            uncompress = self.decompress.flush()
315            self._read_eof()
316            self._add_read_data( uncompress )
317            raise EOFError, 'Reached EOF'
318
319        uncompress = self.decompress.decompress(buf)
320        self._add_read_data( uncompress )
321
322        if self.decompress.unused_data != "":
323            # Ending case: we've come to the end of a member in the file,
324            # so seek back to the start of the unused data, finish up
325            # this member, and read a new gzip header.
326            # (The number of bytes to seek back is the length of the unused
327            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
328            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
329
330            # Check the CRC and file size, and set the flag so we read
331            # a new member on the next call
332            self._read_eof()
333            self._new_member = True
334
335    def _add_read_data(self, data):
336        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
337        offset = self.offset - self.extrastart
338        self.extrabuf = self.extrabuf[offset:] + data
339        self.extrasize = self.extrasize + len(data)
340        self.extrastart = self.offset
341        self.size = self.size + len(data)
342
343    def _read_eof(self):
344        # We've read to the end of the file, so we have to rewind in order
345        # to reread the 8 bytes containing the CRC and the file size.
346        # We check the that the computed CRC and size of the
347        # uncompressed data matches the stored values.  Note that the size
348        # stored is the true file size mod 2**32.
349        self.fileobj.seek(-8, 1)
350        crc32 = read32(self.fileobj)
351        isize = read32(self.fileobj)  # may exceed 2GB
352        if crc32 != self.crc:
353            raise IOError("CRC check failed %s != %s" % (hex(crc32),
354                                                         hex(self.crc)))
355        elif isize != (self.size & 0xffffffffL):
356            raise IOError, "Incorrect length of data produced"
357
358        # Gzip files can be padded with zeroes and still have archives.
359        # Consume all zero bytes and set the file position to the first
360        # non-zero byte. See http://www.gzip.org/#faq8
361        c = "\x00"
362        while c == "\x00":
363            c = self.fileobj.read(1)
364        if c:
365            self.fileobj.seek(-1, 1)
366
367    @property
368    def closed(self):
369        return self.fileobj is None
370
371    def close(self):
372        fileobj = self.fileobj
373        if fileobj is None:
374            return
375        self.fileobj = None
376        try:
377            if self.mode == WRITE:
378                fileobj.write(self.compress.flush())
379                write32u(fileobj, self.crc)
380                # self.size may exceed 2GB, or even 4GB
381                write32u(fileobj, self.size & 0xffffffffL)
382        finally:
383            myfileobj = self.myfileobj
384            if myfileobj:
385                self.myfileobj = None
386                myfileobj.close()
387
388    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
389        self._check_closed()
390        if self.mode == WRITE:
391            # Ensure the compressor's buffer is flushed
392            self.fileobj.write(self.compress.flush(zlib_mode))
393            self.fileobj.flush()
394
395    def fileno(self):
396        """Invoke the underlying file object's fileno() method.
397
398        This will raise AttributeError if the underlying file object
399        doesn't support fileno().
400        """
401        return self.fileobj.fileno()
402
403    def rewind(self):
404        '''Return the uncompressed stream file position indicator to the
405        beginning of the file'''
406        if self.mode != READ:
407            raise IOError("Can't rewind in write mode")
408        self.fileobj.seek(0)
409        self._new_member = True
410        self.extrabuf = ""
411        self.extrasize = 0
412        self.extrastart = 0
413        self.offset = 0
414
415    def readable(self):
416        return self.mode == READ
417
418    def writable(self):
419        return self.mode == WRITE
420
421    def seekable(self):
422        return True
423
424    def seek(self, offset, whence=0):
425        if whence:
426            if whence == 1:
427                offset = self.offset + offset
428            else:
429                raise ValueError('Seek from end not supported')
430        if self.mode == WRITE:
431            if offset < self.offset:
432                raise IOError('Negative seek in write mode')
433            count = offset - self.offset
434            for i in xrange(count // 1024):
435                self.write(1024 * '\0')
436            self.write((count % 1024) * '\0')
437        elif self.mode == READ:
438            if offset < self.offset:
439                # for negative seek, rewind and do positive seek
440                self.rewind()
441            count = offset - self.offset
442            for i in xrange(count // 1024):
443                self.read(1024)
444            self.read(count % 1024)
445
446        return self.offset
447
448    def readline(self, size=-1):
449        if size < 0:
450            # Shortcut common case - newline found in buffer.
451            offset = self.offset - self.extrastart
452            i = self.extrabuf.find('\n', offset) + 1
453            if i > 0:
454                self.extrasize -= i - offset
455                self.offset += i - offset
456                return self.extrabuf[offset: i]
457
458            size = sys.maxint
459            readsize = self.min_readsize
460        else:
461            readsize = size
462        bufs = []
463        while size != 0:
464            c = self.read(readsize)
465            i = c.find('\n')
466
467            # We set i=size to break out of the loop under two
468            # conditions: 1) there's no newline, and the chunk is
469            # larger than size, or 2) there is a newline, but the
470            # resulting line would be longer than 'size'.
471            if (size <= i) or (i == -1 and len(c) > size):
472                i = size - 1
473
474            if i >= 0 or c == '':
475                bufs.append(c[:i + 1])    # Add portion of last chunk
476                self._unread(c[i + 1:])   # Push back rest of chunk
477                break
478
479            # Append chunk to list, decrease 'size',
480            bufs.append(c)
481            size = size - len(c)
482            readsize = min(size, readsize * 2)
483        if readsize > self.min_readsize:
484            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
485        return ''.join(bufs) # Return resulting line
486
487
488def _test():
489    # Act like gzip; with -d, act like gunzip.
490    # The input file is not deleted, however, nor are any other gzip
491    # options or features supported.
492    args = sys.argv[1:]
493    decompress = args and args[0] == "-d"
494    if decompress:
495        args = args[1:]
496    if not args:
497        args = ["-"]
498    for arg in args:
499        if decompress:
500            if arg == "-":
501                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
502                g = sys.stdout
503            else:
504                if arg[-3:] != ".gz":
505                    print "filename doesn't end in .gz:", repr(arg)
506                    continue
507                f = open(arg, "rb")
508                g = __builtin__.open(arg[:-3], "wb")
509        else:
510            if arg == "-":
511                f = sys.stdin
512                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
513            else:
514                f = __builtin__.open(arg, "rb")
515                g = open(arg + ".gz", "wb")
516        while True:
517            chunk = f.read(1024)
518            if not chunk:
519                break
520            g.write(chunk)
521        if g is not sys.stdout:
522            g.close()
523        if f is not sys.stdin:
524            f.close()
525
# Run the command-line driver only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    _test()
528