• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Functions that read and write gzipped files.
2
3The user of the file doesn't have to worry about the compression,
4but random access is not allowed."""
5
6# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8import struct, sys, time, os
9import zlib
10import builtins
11import io
12import _compression
13
# Public names exported by a star-import of this module.
__all__ = [
    "BadGzipFile",
    "GzipFile",
    "open",
    "compress",
    "decompress",
]
15
# Bit masks tested against the flag byte of a gzip member header.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2

# Named compression levels used as defaults by this module.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9
23
24
def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    *filename* is either a path (str, bytes or os.PathLike object) or an
    already open file object to read the compressed data from or write it to.

    *mode* is one of "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for binary
    mode, or "rt", "wt", "xt" or "at" for text mode.  It defaults to "rb".
    *compresslevel* defaults to 9.

    In binary mode the result is simply
    GzipFile(filename, mode, compresslevel); passing *encoding*, *errors* or
    *newline* is then an error (ValueError).

    In text mode the GzipFile object is wrapped in an io.TextIOWrapper built
    with the given *encoding*, *errors* and *newline*.

    Raises ValueError for a mode containing both "t" and "b", and TypeError
    when *filename* is neither a path nor an object with a read() or write()
    attribute.

    """
    wants_text = "t" in mode
    if wants_text and "b" in mode:
        raise ValueError("Invalid mode: %r" % (mode,))
    if not wants_text:
        # Text-layer options make no sense without a TextIOWrapper.
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    binary_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        gz_file = GzipFile(filename, binary_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        gz_file = GzipFile(None, binary_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if not wants_text:
        return gz_file
    return io.TextIOWrapper(gz_file, encoding, errors, newline)
68
def write32u(output, value):
    """Write *value* to *output* as four bytes, little-endian unsigned."""
    # "<L" selects a standard-size unsigned long in little-endian byte order.
    packed = struct.pack("<L", value)
    output.write(packed)
73
74class _PaddedFile:
75    """Minimal read-only file object that prepends a string to the contents
76    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
77    essential functionality."""
78
79    def __init__(self, f, prepend=b''):
80        self._buffer = prepend
81        self._length = len(prepend)
82        self.file = f
83        self._read = 0
84
85    def read(self, size):
86        if self._read is None:
87            return self.file.read(size)
88        if self._read + size <= self._length:
89            read = self._read
90            self._read += size
91            return self._buffer[read:self._read]
92        else:
93            read = self._read
94            self._read = None
95            return self._buffer[read:] + \
96                   self.file.read(size-self._length+read)
97
98    def prepend(self, prepend=b''):
99        if self._read is None:
100            self._buffer = prepend
101        else:  # Assume data was read since the last prepend() call
102            self._read -= len(prepend)
103            return
104        self._length = len(self._buffer)
105        self._read = 0
106
107    def seek(self, off):
108        self._read = None
109        self._buffer = None
110        return self.file.seek(off)
111
112    def seekable(self):
113        return True  # Allows fast-forwarding even in unseekable streams
114
115
class BadGzipFile(OSError):
    """Raised when data being read turns out not to be valid gzip.

    The reader in this module raises it for a wrong magic number, an unknown
    compression method, and a CRC or length mismatch at the end of a member.
    """
118
119
class GzipFile(_compression.BaseStream):
    """Binary file object that transparently gzips or gunzips its data.

    Most of the usual file-object methods are available; truncate() is the
    notable exception.

    Only binary modes are accepted.  To work with a compressed file in text
    mode, go through the module-level open() function instead.

    """

    # Replaced on the instance by the file object the constructor opened
    # itself (filename-only case), so that close() knows it must close it.
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Create a GzipFile on top of *fileobj* or a newly opened *filename*.

        At least one of *fileobj* and *filename* needs a non-trivial value.

        *fileobj* is the object that holds the compressed data: a regular
        file, an io.BytesIO, or anything else that behaves like a file.  When
        it is None, *filename* is opened to obtain one.

        When *fileobj* is given, *filename* is used only for the name stored
        in the gzip header.  If *filename* is None, the name attribute of
        *fileobj* is used when it is a str or bytes; otherwise the name is
        empty and no original filename is written to the header.

        *mode* may be 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x' or 'xb'; the
        variants without 'b' mean the same as those with it.  When omitted,
        the mode attribute of *fileobj* is used if it has one, else 'rb'.
        Modes containing 't' or 'U' raise ValueError.

        *compresslevel* is an integer from 0 to 9: 0 means no compression,
        1 is fastest with the least compression, 9 is slowest with the most.
        The default is 9.

        *mtime* is an optional numeric timestamp written to the header's
        modification-time field when compressing; None (the default) means
        the current time.

        """

        if mode:
            if 't' in mode or 'U' in mode:
                raise ValueError("Invalid mode: {!r}".format(mode))
            if 'b' not in mode:
                mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is not None:
            filename = os.fspath(filename)
        else:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            self._buffer = io.BufferedReader(_GzipReader(fileobj))
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate data, the gzip framing is written
            # by this class itself.
            self.compress = zlib.compressobj(level=compresslevel,
                                             method=zlib.DEFLATED,
                                             wbits=-zlib.MAX_WBITS,
                                             memLevel=zlib.DEF_MEM_LEVEL,
                                             strategy=0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def filename(self):
        """Deprecated alias of name (with '.gz' appended in write mode)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        name = self.name
        if self.mode == WRITE and name[-3:] != ".gz":
            return name + ".gz"
        return name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        reader = self._buffer.raw
        return reader._last_mtime

    def __repr__(self):
        # Reuse the wrapped object's repr without its first and last
        # characters.
        inner = repr(self.fileobj)[1:-1]
        return '<gzip ' + inner + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a member."""
        self.name = filename
        self.offset = 0  # Current file offset for seek(), tell(), etc
        self.size = 0
        self.crc = zlib.crc32(b"")
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to the underlying file object."""
        out = self.fileobj
        out.write(b'\037\213')             # magic header
        out.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = FNAME if fname else 0
        out.write(bytes((flags,)))
        stamp = self._write_mtime
        if stamp is None:
            stamp = time.time()
        write32u(out, int(stamp))
        # Extra-flags byte: records whether the best or the fastest level
        # was requested, zero otherwise.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        out.write(xfl)
        out.write(b'\377')                 # OS byte (255: unknown in RFC 1952)
        if fname:
            out.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the number of
        uncompressed bytes consumed."""
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if not isinstance(data, bytes):
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes
        else:
            length = len(data)

        if length <= 0:
            return length

        self.fileobj.write(self.compress.compress(data))
        self.size += length
        self.crc = zlib.crc32(data, self.crc)
        self.offset += length
        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes (all remaining
        data when *size* is negative)."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        limit = io.DEFAULT_BUFFER_SIZE if size < 0 else size
        return self._buffer.read1(limit)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek()."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has dropped the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream; close the file only if this object opened it.

        In write mode the remaining compressed data, the CRC and the size
        are written first.  Calling close() again is a no-op.
        """
        fileobj, self.fileobj = self.fileobj, None
        if fileobj is None:
            return
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            owned = self.myfileobj
            if owned:
                self.myfileobj = None
                owned.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and then the file object."""
        self._check_not_closed()
        if self.mode != WRITE:
            return
        # Ensure the compressor's buffer is flushed
        self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Move to an uncompressed offset and return the new position.

        In read mode the call is delegated to the internal buffered reader.
        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes, and seeking relative to the end is rejected.
        """
        if self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        if self.mode == WRITE:
            if whence == io.SEEK_CUR:
                offset = self.offset + offset
            elif whence != io.SEEK_SET:
                raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            gap = offset - self.offset
            full_chunks, remainder = divmod(gap, 1024)
            zeros = b'\0' * 1024
            for _ in range(full_chunks):
                self.write(zeros)
            self.write(b'\0' * remainder)

        return self.offset

    def readline(self, size=-1):
        """Read one line via the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)
391
392
class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes the gzip members found in a file object.

    The file object is wrapped in a _PaddedFile so that bytes read past the
    end of a member can be pushed back for the trailer and the next header.
    """

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size for a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.

        Raises EOFError if the file ends before *n* bytes were read.
        '''

        collected = self._fp.read(n)
        missing = n - len(collected)
        while missing > 0:
            more = self._fp.read(missing)
            if not more:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            collected += more
            missing = n - len(collected)
        return collected

    def _read_gzip_header(self):
        """Consume one member header from the file.

        Returns False if the file is already at EOF (no magic bytes could
        be read) and True once a header has been consumed.  Raises
        BadGzipFile for a wrong magic number or compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        header = self._read_exact(8)
        method, flag, self._last_mtime = struct.unpack("<BBIxx", header)
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        # The filename and the comment (in that order) are both stored as
        # null-terminated strings; read and discard whichever are present.
        for text_field in (FNAME, FCOMMENT):
            if not flag & text_field:
                continue
            while True:
                byte = self._fp.read(1)
                if byte == b'\000' or not byte:
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, or b"" at end of file.

        A negative *size* reads everything that is left.  Raises EOFError if
        the compressed data stops before the end of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            raw_chunk = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            decomp = self._decompressor
            produced = decomp.decompress(raw_chunk, size)
            pending = decomp.unconsumed_tail
            if pending != b"":
                self._fp.prepend(pending)
            elif decomp.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(decomp.unused_data)

            if produced != b"":
                break
            if raw_chunk == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data(produced)
        self._pos += len(produced)
        return produced

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._stream_size = self._stream_size + len(data)
        self._crc = zlib.crc32(data, self._crc)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        if isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        while True:
            byte = self._fp.read(1)
            if byte != b"\x00":
                break
        if byte:
            self._fp.prepend(byte)

    def _rewind(self):
        super()._rewind()
        self._new_member = True
533
def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Gzip *data* in a single call and return the compressed bytes.

    *compresslevel* (0-9) and the keyword-only *mtime* are passed straight
    to the GzipFile that does the work.
    """
    sink = io.BytesIO()
    writer = GzipFile(fileobj=sink, mode='wb',
                      compresslevel=compresslevel, mtime=mtime)
    # Leaving the with-block closes the GzipFile, which completes the stream.
    with writer:
        writer.write(data)
    return sink.getvalue()
542
def decompress(data):
    """Gunzip *data* in a single call and return the decompressed bytes."""
    source = io.BytesIO(data)
    with GzipFile(fileobj=source) as reader:
        payload = reader.read()
    return payload
549
550
def main():
    """Command-line entry point: gzip or gunzip each file named in argv.

    Without -d every argument is compressed to "<name>.gz"; with -d every
    argument must end in ".gz" and is decompressed to the name without that
    suffix.  The argument "-" (also the default when no file is given) means
    stdin to stdout.  Input files are never deleted.

    --fast and --best select the compression level; otherwise the trade-off
    level is used.  The chosen level applies to named files as well as to
    stdin.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
                             compresslevel=compresslevel)
            else:
                f = builtins.open(arg, "rb")
                # Pass the level explicitly: the module-level open() would
                # otherwise fall back to its own default and ignore
                # --fast/--best for named files.
                g = open(arg + ".gz", "wb", compresslevel=compresslevel)
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close whatever we opened even if the copy failed, but leave
            # the process-wide standard streams alone.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()


if __name__ == '__main__':
    main()
602