• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import builtins
11import io
12import _compression
13
__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks for the flag byte of a gzip member header.  FEXTRA, FNAME,
# FCOMMENT and FHCRC are tested when a header is parsed; FNAME is set when a
# header carrying the original file name is written.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object was opened for
# reading or for writing.
READ = 'rb'
WRITE = 'wb'

# Named zlib compression levels: fastest, the command-line default, and the
# library default (best compression).
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9

# Number of compressed bytes requested from the underlying file per read, and
# the chunk size used by the command-line copy loop.
READ_BUFFER_SIZE = 128 * 1024
# Size of the io.BufferedWriter that collects small writes before they are
# handed to the compressor.
_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE
27
28
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    filename is either a path (str, bytes or os.PathLike) or an already
    open file object that has a read() or write() method.

    mode is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    access, or "rt", "wt", "xt" or "at" for text access.  It defaults to
    "rb"; compresslevel defaults to 9.

    In binary mode the result is the same as calling
    GzipFile(filename, mode, compresslevel), and encoding, errors and
    newline must be left as None (ValueError otherwise).

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and newline behavior.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when filename is neither a path nor a file-like object.
    """
    wants_text = "t" in mode

    # Validate the argument combination before touching the file system.
    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if not wants_text:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    binary_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        raw = GzipFile(filename, binary_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        raw = GzipFile(None, binary_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not wants_text:
        return raw
    encoding = io.text_encoding(encoding)
    return io.TextIOWrapper(raw, encoding, errors, newline)
73
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer."""
    packed = struct.pack("<L", value)
    output.write(packed)
78
class _PaddedFile:
    """Read-only wrapper that serves a prepended byte string before the
    contents of the wrapped file.

    Internal helper for this module only; it implements just read(),
    prepend(), seek() and seekable().
    """

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Index of the next unread byte in _buffer, or None once the buffer
        # has been exhausted and reads go straight to the file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        start = self._read
        if start is None:
            return self.file.read(size)

        stop = start + size
        if stop <= self._length:
            # The request is satisfied entirely from the buffer.
            self._read = stop
            return self._buffer[start:stop]

        # The buffer cannot cover the request: hand out what is left and top
        # it up from the underlying file.
        self._read = None
        remainder = size - self._length + start
        return self._buffer[start:] + self.file.read(remainder)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def seek(self, off):
        """Drop any buffered bytes and seek the underlying file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
119
120
class BadGzipFile(OSError):
    """Raised by this module when gzip data is found to be invalid.

    Used for a wrong magic number, an unknown compression method, and a CRC
    or length mismatch in a member trailer.
    """
123
124
class _WriteBufferStream(io.RawIOBase):
    """Minimal raw stream that forwards buffered writes into a GzipFile.

    GzipFile wraps an instance of this class in an io.BufferedWriter; each
    time that buffer flushes, write() passes the bytes to the GzipFile's
    _write_raw() method.

    Only a weak reference to the GzipFile is kept.  The GzipFile owns the
    BufferedWriter, which owns this object, so a strong back-reference would
    form a reference cycle and delay finalization of an unclosed GzipFile
    until the cyclic garbage collector runs.
    """
    def __init__(self, gzip_file):
        # Imported locally so this class does not rely on a module-level
        # import of weakref.
        import weakref
        self.gzip_file = weakref.ref(gzip_file)

    def write(self, data):
        """Forward *data* to the owning GzipFile; return the byte count.

        Raises RuntimeError if the GzipFile has already been collected.
        """
        gzip_file = self.gzip_file()
        if gzip_file is None:
            raise RuntimeError("lost gzip_file")
        return gzip_file._write_raw(data)

    def seekable(self):
        return False

    def writable(self):
        return True
138
139
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The optional mtime argument is the timestamp requested by gzip. The time
        is in Unix format, i.e., seconds since 00:00:00 UTC, January 1, 1970.
        If mtime is omitted or None, the current time is used. Use mtime = 0
        to generate a compressed stream that does not depend on creation time.

        Raises ValueError if mode is a text mode or is otherwise not one of
        the supported modes.  If construction fails after this object has
        opened filename itself, that file is closed again before the
        exception propagates.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'

        try:
            if fileobj is None:
                fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    filename = ''
            else:
                filename = os.fspath(filename)
            origmode = mode
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                if origmode is None:
                    import warnings
                    warnings.warn(
                        "GzipFile was opened for writing, but this will "
                        "change in future Python releases.  "
                        "Specify the mode argument for opening it for writing.",
                        FutureWarning, 2)
                self.mode = WRITE
                self._init_write(filename)
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
                self._buffer_size = _WRITE_BUFFER_SIZE
                self._buffer = io.BufferedWriter(_WriteBufferStream(self),
                                                 buffer_size=self._buffer_size)
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Do not leak a file we opened ourselves when setup fails part
            # way through (invalid mode, failing header write, interrupt...).
            self._close()
            raise

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def tell(self):
        """Return the current position in the uncompressed stream."""
        self._check_not_closed()
        # Pending buffered writes must reach _write_raw() first so that
        # self.offset is up to date.
        self._buffer.flush()
        return super().tell()

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte: 2 marks best compression, 4 marks fastest.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Queue *data* for compression and return what the buffer reports.

        Raises OSError(EBADF) when the file was not opened for writing and
        ValueError when it is closed.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        return self._buffer.write(data)

    def _write_raw(self, data):
        """Compress *data*, write it out and update CRC, size and offset."""
        # Called by our self._buffer underlying WriteBufferStream.
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes.

        Raises OSError(EBADF) when the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Return buffered uncompressed bytes without advancing the position.

        Raises OSError(EBADF) when the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once the object has been closed (or failed to initialize)."""
        return self.fileobj is None

    def _close(self):
        """Forget the underlying file and close it if this object opened it."""
        self.fileobj = None
        myfileobj = self.myfileobj
        if myfileobj is not None:
            self.myfileobj = None
            myfileobj.close()

    def close(self):
        """Finish the gzip stream (in write mode) and release the file.

        In write mode the remaining compressed data and the CRC/size trailer
        are written.  A caller-supplied fileobj is left open; a file opened
        by this object is closed.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None or self._buffer.closed:
            return
        try:
            if self.mode == WRITE:
                self._buffer.flush()
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            self._close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush buffered and compressor-held data to the underlying file.

        Does nothing beyond the closed check in read mode.
        """
        self._check_not_closed()
        if self.mode == WRITE:
            self._buffer.flush()
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes, and the resulting offset is returned.  Seeking
        relative to the end raises ValueError and seeking backwards raises
        OSError.  In read mode the request is delegated to the read buffer.
        """
        if self.mode == WRITE:
            self._check_not_closed()
            # Flush buffer to ensure validity of self.offset
            self._buffer.flush()
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            # Emit the padding in buffer-sized pieces rather than allocating
            # one block of `count` bytes.
            chunk = b'\0' * self._buffer_size
            for i in range(count // self._buffer_size):
                self.write(chunk)
            self.write(b'\0' * (count % self._buffer_size))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line from the uncompressed stream."""
        self._check_not_closed()
        return self._buffer.readline(size)
430
431
def _read_exact(fp, n):
    '''Read exactly *n* bytes from `fp`

    `fp` may be unbuffered and return short reads, so reads are repeated
    until *n* bytes have been collected.  Raises EOFError if `fp` runs dry
    first.
    '''
    data = fp.read(n)
    missing = n - len(data)
    while missing > 0:
        more = fp.read(missing)
        if not more:
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")
        data += more
        missing = n - len(data)
    return data
446
447
def _read_gzip_header(fp):
    '''Consume one gzip member header from `fp`.

    Returns the modification time stored in the header, or None when `fp`
    is already at end of file.  Raises BadGzipFile for a wrong magic number
    or an unknown compression method.
    '''
    magic = fp.read(2)
    if magic == b'':
        return None
    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    # Fixed part after the magic: method, flags, mtime, then two bytes that
    # are skipped.
    fixed = _read_exact(fp, 8)
    method, flag, last_mtime = struct.unpack("<BBIxx", fixed)
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Length-prefixed extra field: read and discard.
        (extra_len,) = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)

    # File name and comment are both null-terminated strings; skip each one
    # that the flags announce, stopping early at end of file.
    for string_flag in (FNAME, FCOMMENT):
        if flag & string_flag:
            byte = fp.read(1)
            while byte and byte != b'\000':
                byte = fp.read(1)

    if flag & FHCRC:
        _read_exact(fp, 2)     # Read & discard the 16-bit header CRC
    return last_mtime
483
484
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decompresses a gzip stream, member by member.

    The underlying file is wrapped in a _PaddedFile so that bytes the
    decompressor read past the end of a member can be pushed back and
    re-read as the trailer and the next header.
    """
    def __init__(self, fp):
        # Raw deflate (negative wbits): the gzip header and trailer are
        # parsed by this class, not by zlib.
        super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        # mtime from the most recently parsed member header, None until a
        # header has been read.
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size for the member about to be read."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_gzip_header(self):
        """Parse the next member header; return False at end of file."""
        last_mtime = _read_gzip_header(self._fp)
        if last_mtime is None:
            return False
        self._last_mtime = last_mtime
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, b"" at end of stream.

        A negative size reads everything via readall().  Raises EOFError if
        the compressed data ends before the current member is complete.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                # A finished decompressor cannot be reused; build a fresh
                # one for the next member.
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: record the total size and signal
                    # end of stream.
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            if self._decompressor.needs_input:
                buf = self._fp.read(READ_BUFFER_SIZE)
                uncompress = self._decompressor.decompress(buf, size)
            else:
                # The decompressor still holds data it can work on; do not
                # feed it more input yet.
                uncompress = self._decompressor.decompress(b"", size)

            if self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Fold the new data into the per-member CRC and size (checked in
        # _read_eof) and advance the overall position.
        self._crc = zlib.crc32(uncompress, self._crc)
        self._stream_size += len(uncompress)
        self._pos += len(uncompress)
        return uncompress

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises BadGzipFile on a CRC or length mismatch.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            # Push the first non-zero byte back for the next header parse.
            self._fp.prepend(c)

    def _rewind(self):
        """Rewind via the base class and expect a member header next."""
        super()._rewind()
        self._new_member = True
581
582
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress *data* in one call and return the gzip-format bytes.

    compresslevel selects the compression level, from 0 to 9.
    mtime is the modification time stored in the header; when it is None
    the current time is used.
    """
    # With wbits=31 zlib produces a complete gzip member, header and
    # trailer included.
    gzip_data = zlib.compress(data, level=compresslevel, wbits=31)
    if mtime is None:
        mtime = time.time()
    # Keep zlib's first four header bytes and its extra-flags byte, but
    # substitute our own mtime and set the OS byte to 255 for consistency.
    leading = gzip_data[:4]
    stamp = struct.pack("<L", int(mtime))
    extra_flags = gzip_data[8:9]
    return leading + stamp + extra_flags + b"\xff" + gzip_data[10:]
598
599
def decompress(data):
    """Decompress gzip data in one call and return the resulting bytes.

    All concatenated members in *data* are decompressed and joined.  Raises
    EOFError when a member is truncated and BadGzipFile when a member's CRC
    or length check fails.
    """
    members = []
    while True:
        stream = io.BytesIO(data)
        if _read_gzip_header(stream) is None:
            # Nothing left to parse: every member has been handled.
            return b"".join(members)
        header_len = stream.tell()
        # The header is already consumed, so decode a raw deflate stream.
        inflater = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
        payload = inflater.decompress(data[header_len:])
        trailer = inflater.unused_data
        if not inflater.eof or len(trailer) < 8:
            raise EOFError("Compressed file ended before the end-of-stream "
                           "marker was reached")
        crc, length = struct.unpack("<II", trailer[:8])
        if crc != zlib.crc32(payload):
            raise BadGzipFile("CRC check failed")
        if length != (len(payload) & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")
        members.append(payload)
        # Continue after the trailer, ignoring zero padding between members.
        data = trailer[8:].lstrip(b"\x00")
623
624
def main():
    """Command-line entry point: gzip or gunzip the named files.

    Each argument is compressed to "<name>.gz", or with -d decompressed from
    "<name>.gz" to "<name>"; "-" means standard input/output.  Input files
    are never deleted.  --fast and --best select the compression level for
    every output, including named files.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    sys.exit(f"filename doesn't end in .gz: {arg!r}")
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
                             compresslevel=compresslevel)
            else:
                f = builtins.open(arg, "rb")
                # Pass the requested level explicitly; otherwise named files
                # are always written with the default level.
                g = open(arg + ".gz", "wb", compresslevel=compresslevel)
        try:
            while True:
                chunk = f.read(READ_BUFFER_SIZE)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Release both files even if the copy fails; the process-wide
            # standard streams are left open.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()

if __name__ == '__main__':
    main()
675