• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import builtins
11import io
12import _compression
13
14__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]
15
# Bit masks tested against / written into the flag byte of a gzip header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2

# Named compression levels: used as defaults and to choose the header's
# extra-flags byte when writing.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike) or an already
    open file object exposing read() or write().

    *mode* is "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary access,
    or "rt", "wt", "xt" or "at" for text access.  It defaults to "rb";
    *compresslevel* defaults to 9.

    In binary mode the result is the GzipFile built from
    (filename, mode, compresslevel), and passing encoding, errors or newline
    raises ValueError.

    In text mode the GzipFile is wrapped in an io.TextIOWrapper configured
    with the given encoding, error handling and newline behaviour.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when *filename* is neither a path nor a file-like object.
    """
    wants_text = "t" in mode
    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if not wants_text:
        # Text-only options make no sense for a raw byte stream.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes, so drop the "t".
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif not (hasattr(filename, "read") or hasattr(filename, "write")):
        raise TypeError("filename must be a str or bytes object, or a file")
    else:
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)

    if not wants_text:
        return binary_file

    encoding = io.text_encoding(encoding)
    return io.TextIOWrapper(binary_file, encoding, errors, newline)
69
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    packed = struct.pack("<L", value)
    output.write(packed)
74
class _PaddedFile:
    """Tiny read-only wrapper that serves some prepended bytes before the
    contents of a real file.  It is deliberately incomplete and is only
    meant for internal use by gzip.py."""

    def __init__(self, f, prepend=b''):
        self.file = f
        self._buffer = prepend
        self._length = len(prepend)
        # Offset of the next unread byte in _buffer; None once the buffer is
        # exhausted and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        start = self._read
        if start is None:
            return self.file.read(size)

        stop = start + size
        if stop <= self._length:
            # Request is satisfied entirely from the buffer.
            self._read = stop
            return self._buffer[start:stop]

        # Buffer runs out: hand back its tail plus the shortfall from the file.
        self._read = None
        tail = self._buffer[start:]
        return tail + self.file.read(stop - self._length)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is not None:
            # Assume data was read since the last prepend() call, so the
            # bytes are still in the buffer: just step the offset back.
            self._read -= len(prepend)
            return
        self._buffer = prepend
        self._length = len(prepend)
        self._read = 0

    def seek(self, off):
        """Discard any buffered bytes and seek the wrapped file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams
115
116
class BadGzipFile(OSError):
    """Raised, in some cases, when the data being read is not valid gzip."""
119
120
class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    # Class-level fallback: an instance whose __init__ raised before the
    # attribute was assigned reports itself as closed, so close() (also run
    # on finalisation) is a harmless no-op instead of an AttributeError.
    fileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError for a text ('t'), universal-newline ('U') or
        otherwise unrecognised mode.  If construction fails after this
        constructor opened *filename* itself, that file is closed again
        before the exception propagates.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        try:
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    filename = ''
            else:
                filename = os.fspath(filename)
            origmode = mode
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                if origmode is None:
                    import warnings
                    warnings.warn(
                        "GzipFile was opened for writing, but this will "
                        "change in future Python releases.  "
                        "Specify the mode argument for opening it for writing.",
                        FutureWarning, 2)
                self.mode = WRITE
                self._init_write(filename)
                # Negative wbits: raw deflate stream; the gzip header and
                # trailer are written by this class, not by zlib.
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Leave the half-built object in the "closed" state and release
            # the file opened above; a caller-supplied fileobj is untouched.
            self.fileobj = None
            opened_here = self.myfileobj
            if opened_here is not None:
                self.myfileobj = None
                opened_here.close()
            raise

    @property
    def filename(self):
        """Deprecated alias of ``name``; in write mode ".gz" is appended
        when the name does not already end with it."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a gzip member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # Extra-flags byte: records whether the best or the fastest level
        # was requested; zero for every other level.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte: 255 (unknown)
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the number of
        uncompressed bytes consumed.

        Raises OSError(EBADF) when the file was opened for reading.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes (all remaining
        data when *size* is negative).

        Raises OSError(EBADF) when the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Return buffered uncompressed data without advancing the position.

        Raises OSError(EBADF) when the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has run (or if construction failed)."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release resources; safe to call repeatedly.

        In write mode the remaining compressed data and the CRC/size trailer
        are written.  A file opened by the constructor is closed; a
        caller-supplied fileobj is left open.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor using *zlib_mode* and then
        flush the underlying file object; does nothing in read mode."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes, and seeking relative to the end is rejected.
        In read mode the request is delegated to the buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line of uncompressed data.

        Raises OSError(EBADF) when the file was opened for writing, matching
        read(), read1() and peek().
        """
        self._check_not_closed()
        if self.mode != READ:
            # No read buffer exists in write mode; fail the same way the
            # other read methods do rather than with an AttributeError.
            import errno
            raise OSError(errno.EBADF,
                          "readline() on write-only GzipFile object")
        return self._buffer.readline(size)
400
401
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes a stream of one or more gzip members."""

    def __init__(self, fp):
        padded = _PaddedFile(fp)
        super().__init__(padded, zlib.decompressobj, wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size for the member about to be read."""
        self._stream_size = 0  # Decompressed size of unconcatenated stream
        self._crc = zlib.crc32(b"")

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''
        collected = self._fp.read(n)
        missing = n - len(collected)
        while missing > 0:
            more = self._fp.read(missing)
            if not more:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            collected += more
            missing = n - len(collected)
        return collected

    def _skip_zero_terminated(self):
        """Consume bytes up to and including the next NUL byte (or EOF)."""
        while True:
            byte = self._fp.read(1)
            if not byte or byte == b'\000':
                return

    def _read_gzip_header(self):
        """Parse one member header; return False at a clean end of input.

        Raises BadGzipFile for a wrong magic number or compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        fixed_part = self._read_exact(8)
        method, flag, self._last_mtime = struct.unpack("<BBIxx", fixed_part)
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            self._skip_zero_terminated()
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            self._skip_zero_terminated()
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes; b"" signals end of data."""
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            raw = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            plain = self._decompressor.decompress(raw, size)
            # Push back whatever the decompressor did not use.  Bytes past
            # the end of the member must stay visible to _read_eof() and
            # _read_gzip_header().
            leftover = (self._decompressor.unconsumed_tail
                        or self._decompressor.unused_data)
            if leftover != b"":
                self._fp.prepend(leftover)

            if plain != b"":
                self._add_read_data(plain)
                self._pos += len(plain)
                return plain
            if raw == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and decompressed-size counters."""
        self._stream_size += len(data)
        self._crc = zlib.crc32(data, self._crc)

    def _read_eof(self):
        """Validate the member trailer and skip any zero padding after it."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        stored_crc, stored_size = struct.unpack("<II", self._read_exact(8))
        if stored_crc != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(stored_crc),
                                                             hex(self._crc)))
        if stored_size != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        while True:
            byte = self._fp.read(1)
            if byte != b"\x00":
                break
        if byte:
            self._fp.prepend(byte)

    def _rewind(self):
        super()._rewind()
        self._new_member = True
542
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Gzip *data* in a single call and return the compressed bytes.

    *compresslevel* is the optional compression level, in range of 0-9.
    *mtime* is forwarded unchanged to the GzipFile used for writing.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb', compresslevel=compresslevel,
                      mtime=mtime)
    with writer:
        writer.write(data)
    # The writer must be closed before reading the result.
    return sink.getvalue()
551
def decompress(data):
    """Decompress gzip-compressed *data* in a single call.

    Returns everything read from a GzipFile wrapped around *data*.
    """
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
558
559
def main():
    """Command-line entry point: compress or decompress the named files.

    Each positional argument is handled in turn; "-" (the default) means
    standard input to standard output.  With -d/--decompress a file name must
    end in ".gz" and is expanded next to the original; otherwise "<name>.gz"
    is written.  Input files are never deleted.  --fast and --best select
    compression level 1 and 9 respectively; the default level is 6.

    Files opened here are closed even when opening the second file or the
    copy itself fails; the standard streams themselves are never closed.
    """
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        # Callbacks run in reverse registration order, so the output is
        # always closed before the input.
        with ExitStack() as cleanup:
            if args.decompress:
                if arg == "-":
                    f = GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer)
                    cleanup.callback(f.close)
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        sys.exit(f"filename doesn't end in .gz: {arg!r}")
                    f = open(arg, "rb")
                    cleanup.callback(f.close)
                    g = builtins.open(arg[:-3], "wb")
                    cleanup.callback(g.close)
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel)
                    # Closing the wrapper writes the gzip trailer; it does
                    # not close stdout itself.
                    cleanup.callback(g.close)
                else:
                    f = builtins.open(arg, "rb")
                    cleanup.callback(f.close)
                    g = open(arg + ".gz", "wb")
                    cleanup.callback(g.close)

            while chunk := f.read(io.DEFAULT_BUFFER_SIZE):
                g.write(chunk)
607
# Run the command-line interface when this file is executed as a script.
if __name__ == "__main__":
    main()
610