• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import builtins
11import io
12import _compression
13
14__all__ = ["GzipFile", "open", "compress", "decompress"]
15
# Bit masks tested against / written to the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record how the object was opened.
READ = 1
WRITE = 2
19
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike object) or an
    already-open file object, recognised by having a read() or write()
    attribute.

    *mode* is "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; *compresslevel* defaults to 9.

    In binary mode the result is the GzipFile built from
    (filename, mode, compresslevel), and *encoding*, *errors* and *newline*
    must all be left as None.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and newline behaviour.

    Raises ValueError for a mode combining "t" and "b", or for text-only
    arguments supplied in binary mode, and TypeError when *filename* is
    neither a path nor a file-like object.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        # Text-only arguments are rejected in the order encoding, errors,
        # newline, so the first offending one is the one reported.
        text_only = (
            (encoding, "Argument 'encoding' not supported in binary mode"),
            (errors, "Argument 'errors' not supported in binary mode"),
            (newline, "Argument 'newline' not supported in binary mode"),
        )
        for value, message in text_only:
            if value is not None:
                raise ValueError(message)

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")

    if isinstance(filename, (str, bytes, os.PathLike)):
        gz_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        gz_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not text_mode:
        return gz_file
    return io.TextIOWrapper(gz_file, encoding, errors, newline)
63
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
68
class _PaddedFile:
    """Bare-bones read-only file wrapper that serves a prepended byte string
    before the contents of the wrapped file.  It lacks essential file
    functionality and is not meant to be used outside of gzip.py."""

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Position inside self._buffer; None once the buffer is used up and
        # reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, taking them from the prepended buffer
        first and from the wrapped file afterwards."""
        start = self._read
        if start is None:
            return self.file.read(size)

        stop = start + size
        if stop <= self._length:
            # The request is served entirely from the buffer.
            self._read = stop
            return self._buffer[start:stop]

        # The buffer runs out: hand over its remainder and top the result
        # up from the wrapped file.
        self._read = None
        return self._buffer[start:] + \
               self.file.read(size - self._length + start)

    def prepend(self, prepend=b''):
        """Make *prepend* the next bytes returned by read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def seek(self, off):
        """Discard the prepended buffer and seek the wrapped file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
109
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError if mode requests text or universal-newline
        handling, or does not start with 'r', 'w', 'a' or 'x'.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            # We opened the file ourselves, so remember it in myfileobj;
            # close() closes only what is stored there.
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: emit a raw deflate stream, because the gzip
            # header and trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of the name attribute.

        In write mode a ".gz" suffix is appended when the name does not
        already end with one.  Accessing it emits a DeprecationWarning.
        """
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        name = self.name
        if self.mode == WRITE:
            # The name may be str or bytes; the suffix has to be of the same
            # type, otherwise the comparison never matches and the
            # concatenation raises TypeError.
            suffix = b".gz" if isinstance(name, bytes) else ".gz"
            if name[-3:] != suffix:
                return name + suffix
        return name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        # Drop the first and last character of the wrapped object's repr
        # (normally the surrounding angle brackets).
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))          # flag byte
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))          # modification time
        # Per RFC 1952: XFL = 2 (maximum compression), OS = 255 (unknown).
        self.fileobj.write(b'\002')
        self.fileobj.write(b'\377')
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self,data):
        """Compress *data* and write it to the underlying file object.

        *data* may be bytes or any object supporting the buffer protocol.
        Returns the number of uncompressed bytes taken from *data*.
        Raises OSError(EBADF) if the file was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read *size* bytes of uncompressed data via the internal buffered
        reader.

        Raises OSError(EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek().

        Raises OSError(EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the file.

        In write mode the remaining compressed data and the CRC/size trailer
        are written.  In read mode the internal buffer is closed.  A file
        object opened by the constructor is closed as well; one supplied by
        the caller is left open.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        # Mark the object closed before doing any work, so that a failure
        # below cannot lead to the trailer being written a second time.
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor with *zlib_mode* and then
        flush the underlying file object.  Does nothing in read mode."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        """Return True if the file was opened for reading."""
        return self.mode == READ

    def writable(self):
        """Return True if the file was opened for writing."""
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks relative to the start or the
        current position are possible; the gap is filled by writing zero
        bytes, and the new offset is returned.  In read mode the request is
        delegated to the internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            # Pad in 1 KiB pieces so a long seek does not build one huge
            # bytes object.
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line through the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)
375
376
class _GzipReader(_compression.DecompressReader):
    """Decompressing reader for gzip data; when one member ends it verifies
    the trailer and carries on with the next member, if any."""

    def __init__(self, fp):
        padded = _PaddedFile(fp)
        super().__init__(padded, zlib.decompressobj, wbits=-zlib.MAX_WBITS)
        self._last_mtime = None
        # Set flag indicating start of a new member
        self._new_member = True

    def _init_read(self):
        """Reset the running checksum and size for a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        Needed because self._fp may be unbuffered and so may return fewer
        bytes than requested.  Raises EOFError if the file ends first.
        '''
        collected = self._fp.read(n)
        missing = n - len(collected)
        while missing > 0:
            piece = self._fp.read(missing)
            if not piece:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            collected += piece
            missing = n - len(collected)
        return collected

    def _read_gzip_header(self):
        """Consume one member header from self._fp.

        Returns False when the file is exhausted (no further member) and
        True once a header has been read.  Raises OSError for a wrong magic
        number or an unknown compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file (%r)' % magic)

        fixed_part = self._read_exact(8)
        method, flag, self._last_mtime = struct.unpack("<BBIxx", fixed_part)
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            (extra_len,) = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)

        # The file name and the comment are both null-terminated strings;
        # each one that is flagged as present is read and discarded.
        for string_flag in (FNAME, FCOMMENT):
            if flag & string_flag:
                while True:
                    char = self._fp.read(1)
                    if not char or char == b'\000':
                        break

        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* bytes of decompressed data; a negative size
        reads everything, and b"" is returned at end of file."""
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # A single decompress() call may yield nothing for some inputs, so
        # keep going until data shows up or the input is exhausted.
        while True:
            if self._decompressor.eof:
                # The current member is finished: verify its CRC and size,
                # then arrange for the next header to be read with a fresh
                # decompressor.
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # Move on to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            raw_chunk = self._fp.read(io.DEFAULT_BUFFER_SIZE)
            produced = self._decompressor.decompress(raw_chunk, size)

            # Bytes that were read but not consumed are pushed back onto the
            # file object so they can be seen by later reads, _read_eof()
            # and _read_gzip_header().
            pending = self._decompressor.unconsumed_tail
            if pending == b"":
                pending = self._decompressor.unused_data
            if pending != b"":
                self._fp.prepend(pending)

            if produced != b"":
                break
            if raw_chunk == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data(produced)
        self._pos += len(produced)
        return produced

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size += len(data)

    def _read_eof(self):
        """Check the member trailer against the computed CRC and size, then
        skip any zero padding that follows it."""
        # The stored size is the true uncompressed size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self._crc)))
        if isize != (self._stream_size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        while True:
            following = self._fp.read(1)
            if following != b"\x00":
                break
        if following:
            self._fp.prepend(following)

    def _rewind(self):
        super()._rewind()
        self._new_member = True
517
def compress(data, compresslevel=9):
    """Gzip-compress *data* in a single call and return the result as bytes.

    *compresslevel* is the compression level, in the range 0-9.
    """
    sink = io.BytesIO()
    gz_out = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel)
    # Leaving the with block closes the GzipFile, which finishes the stream
    # in the in-memory buffer.
    with gz_out:
        gz_out.write(data)
    return sink.getvalue()
526
def decompress(data):
    """Decompress gzip-compressed *data* in a single call.

    Returns the decompressed bytes.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as gz_in:
        payload = gz_in.read()
    return payload
533
534
def _test():
    """Minimal command-line driver: act like gzip; with -d, act like gunzip.

    Each argument names a file to process; "-" (also the default when no
    file is given) means standard input to standard output.  The input file
    is not deleted, nor are any other gzip options or features supported.
    """
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompress_mode = bool(args) and args[0] == "-d"
    if decompress_mode:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress_mode:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even if the copy fails; the standard streams
            # themselves are never closed.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()
572
573if __name__ == '__main__':
574    _test()
575