1"""Functions that read and write gzipped files. 2 3The user of the file doesn't have to worry about the compression, 4but random access is not allowed.""" 5 6# based on Andrew Kuchling's minigzip.py distributed with the zlib module 7 8import struct, sys, time, os 9import zlib 10import builtins 11import io 12import _compression 13 14__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"] 15 16FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 17 18READ = 'rb' 19WRITE = 'wb' 20 21_COMPRESS_LEVEL_FAST = 1 22_COMPRESS_LEVEL_TRADEOFF = 6 23_COMPRESS_LEVEL_BEST = 9 24 25READ_BUFFER_SIZE = 128 * 1024 26_WRITE_BUFFER_SIZE = 4 * io.DEFAULT_BUFFER_SIZE 27 28 29def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST, 30 encoding=None, errors=None, newline=None): 31 """Open a gzip-compressed file in binary or text mode. 32 33 The filename argument can be an actual filename (a str or bytes object), or 34 an existing file object to read from or write to. 35 36 The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for 37 binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is 38 "rb", and the default compresslevel is 9. 39 40 For binary mode, this function is equivalent to the GzipFile constructor: 41 GzipFile(filename, mode, compresslevel). In this case, the encoding, errors 42 and newline arguments must not be provided. 43 44 For text mode, a GzipFile object is created, and wrapped in an 45 io.TextIOWrapper instance with the specified encoding, error handling 46 behavior, and line ending(s). 47 48 """ 49 if "t" in mode: 50 if "b" in mode: 51 raise ValueError("Invalid mode: %r" % (mode,)) 52 else: 53 if encoding is not None: 54 raise ValueError("Argument 'encoding' not supported in binary mode") 55 if errors is not None: 56 raise ValueError("Argument 'errors' not supported in binary mode") 57 if newline is not None: 58 raise ValueError("Argument 'newline' not supported in binary mode") 59 60 gz_mode = mode.replace("t", "") 61 if isinstance(filename, (str, bytes, os.PathLike)): 62 binary_file = GzipFile(filename, gz_mode, compresslevel) 63 elif hasattr(filename, "read") or hasattr(filename, "write"): 64 binary_file = GzipFile(None, gz_mode, compresslevel, filename) 65 else: 66 raise TypeError("filename must be a str or bytes object, or a file") 67 68 if "t" in mode: 69 encoding = io.text_encoding(encoding) 70 return io.TextIOWrapper(binary_file, encoding, errors, newline) 71 else: 72 return binary_file 73 74def write32u(output, value): 75 # The L format writes the bit pattern correctly whether signed 76 # or unsigned. 77 output.write(struct.pack("<L", value)) 78 79class _PaddedFile: 80 """Minimal read-only file object that prepends a string to the contents 81 of an actual file. Shouldn't be used outside of gzip.py, as it lacks 82 essential functionality.""" 83 84 def __init__(self, f, prepend=b''): 85 self._buffer = prepend 86 self._length = len(prepend) 87 self.file = f 88 self._read = 0 89 90 def read(self, size): 91 if self._read is None: 92 return self.file.read(size) 93 if self._read + size <= self._length: 94 read = self._read 95 self._read += size 96 return self._buffer[read:self._read] 97 else: 98 read = self._read 99 self._read = None 100 return self._buffer[read:] + \ 101 self.file.read(size-self._length+read) 102 103 def prepend(self, prepend=b''): 104 if self._read is None: 105 self._buffer = prepend 106 else: # Assume data was read since the last prepend() call 107 self._read -= len(prepend) 108 return 109 self._length = len(self._buffer) 110 self._read = 0 111 112 def seek(self, off): 113 self._read = None 114 self._buffer = None 115 return self.file.seek(off) 116 117 def seekable(self): 118 return True # Allows fast-forwarding even in unseekable streams 119 120 121class BadGzipFile(OSError): 122 """Exception raised in some cases for invalid gzip files.""" 123 124 125class _WriteBufferStream(io.RawIOBase): 126 """Minimal object to pass WriteBuffer flushes into GzipFile""" 127 def __init__(self, gzip_file): 128 self.gzip_file = gzip_file 129 130 def write(self, data): 131 return self.gzip_file._write_raw(data) 132 133 def seekable(self): 134 return False 135 136 def writable(self): 137 return True 138 139 140class GzipFile(_compression.BaseStream): 141 """The GzipFile class simulates most of the methods of a file object with 142 the exception of the truncate() method. 143 144 This class only supports opening files in binary mode. If you need to open a 145 compressed file in text mode, use the gzip.open() function. 146 147 """ 148 149 # Overridden with internal file object to be closed, if only a filename 150 # is passed in 151 myfileobj = None 152 153 def __init__(self, filename=None, mode=None, 154 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None): 155 """Constructor for the GzipFile class. 156 157 At least one of fileobj and filename must be given a 158 non-trivial value. 159 160 The new class instance is based on fileobj, which can be a regular 161 file, an io.BytesIO object, or any other object which simulates a file. 162 It defaults to None, in which case filename is opened to provide 163 a file object. 164 165 When fileobj is not None, the filename argument is only used to be 166 included in the gzip file header, which may include the original 167 filename of the uncompressed file. It defaults to the filename of 168 fileobj, if discernible; otherwise, it defaults to the empty string, 169 and in this case the original filename is not included in the header. 170 171 The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or 172 'xb' depending on whether the file will be read or written. The default 173 is the mode of fileobj if discernible; otherwise, the default is 'rb'. 174 A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and 175 'wb', 'a' and 'ab', and 'x' and 'xb'. 176 177 The compresslevel argument is an integer from 0 to 9 controlling the 178 level of compression; 1 is fastest and produces the least compression, 179 and 9 is slowest and produces the most compression. 0 is no compression 180 at all. The default is 9. 181 182 The optional mtime argument is the timestamp requested by gzip. The time 183 is in Unix format, i.e., seconds since 00:00:00 UTC, January 1, 1970. 184 If mtime is omitted or None, the current time is used. Use mtime = 0 185 to generate a compressed stream that does not depend on creation time. 186 187 """ 188 189 if mode and ('t' in mode or 'U' in mode): 190 raise ValueError("Invalid mode: {!r}".format(mode)) 191 if mode and 'b' not in mode: 192 mode += 'b' 193 if fileobj is None: 194 fileobj = self.myfileobj = builtins.open(filename, mode or 'rb') 195 if filename is None: 196 filename = getattr(fileobj, 'name', '') 197 if not isinstance(filename, (str, bytes)): 198 filename = '' 199 else: 200 filename = os.fspath(filename) 201 origmode = mode 202 if mode is None: 203 mode = getattr(fileobj, 'mode', 'rb') 204 205 206 if mode.startswith('r'): 207 self.mode = READ 208 raw = _GzipReader(fileobj) 209 self._buffer = io.BufferedReader(raw) 210 self.name = filename 211 212 elif mode.startswith(('w', 'a', 'x')): 213 if origmode is None: 214 import warnings 215 warnings.warn( 216 "GzipFile was opened for writing, but this will " 217 "change in future Python releases. " 218 "Specify the mode argument for opening it for writing.", 219 FutureWarning, 2) 220 self.mode = WRITE 221 self._init_write(filename) 222 self.compress = zlib.compressobj(compresslevel, 223 zlib.DEFLATED, 224 -zlib.MAX_WBITS, 225 zlib.DEF_MEM_LEVEL, 226 0) 227 self._write_mtime = mtime 228 self._buffer_size = _WRITE_BUFFER_SIZE 229 self._buffer = io.BufferedWriter(_WriteBufferStream(self), 230 buffer_size=self._buffer_size) 231 else: 232 raise ValueError("Invalid mode: {!r}".format(mode)) 233 234 self.fileobj = fileobj 235 236 if self.mode == WRITE: 237 self._write_gzip_header(compresslevel) 238 239 @property 240 def mtime(self): 241 """Last modification time read from stream, or None""" 242 return self._buffer.raw._last_mtime 243 244 def __repr__(self): 245 s = repr(self.fileobj) 246 return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>' 247 248 def _init_write(self, filename): 249 self.name = filename 250 self.crc = zlib.crc32(b"") 251 self.size = 0 252 self.writebuf = [] 253 self.bufsize = 0 254 self.offset = 0 # Current file offset for seek(), tell(), etc 255 256 def tell(self): 257 self._check_not_closed() 258 self._buffer.flush() 259 return super().tell() 260 261 def _write_gzip_header(self, compresslevel): 262 self.fileobj.write(b'\037\213') # magic header 263 self.fileobj.write(b'\010') # compression method 264 try: 265 # RFC 1952 requires the FNAME field to be Latin-1. Do not 266 # include filenames that cannot be represented that way. 267 fname = os.path.basename(self.name) 268 if not isinstance(fname, bytes): 269 fname = fname.encode('latin-1') 270 if fname.endswith(b'.gz'): 271 fname = fname[:-3] 272 except UnicodeEncodeError: 273 fname = b'' 274 flags = 0 275 if fname: 276 flags = FNAME 277 self.fileobj.write(chr(flags).encode('latin-1')) 278 mtime = self._write_mtime 279 if mtime is None: 280 mtime = time.time() 281 write32u(self.fileobj, int(mtime)) 282 if compresslevel == _COMPRESS_LEVEL_BEST: 283 xfl = b'\002' 284 elif compresslevel == _COMPRESS_LEVEL_FAST: 285 xfl = b'\004' 286 else: 287 xfl = b'\000' 288 self.fileobj.write(xfl) 289 self.fileobj.write(b'\377') 290 if fname: 291 self.fileobj.write(fname + b'\000') 292 293 def write(self,data): 294 self._check_not_closed() 295 if self.mode != WRITE: 296 import errno 297 raise OSError(errno.EBADF, "write() on read-only GzipFile object") 298 299 if self.fileobj is None: 300 raise ValueError("write() on closed GzipFile object") 301 302 return self._buffer.write(data) 303 304 def _write_raw(self, data): 305 # Called by our self._buffer underlying WriteBufferStream. 306 if isinstance(data, (bytes, bytearray)): 307 length = len(data) 308 else: 309 # accept any data that supports the buffer protocol 310 data = memoryview(data) 311 length = data.nbytes 312 313 if length > 0: 314 self.fileobj.write(self.compress.compress(data)) 315 self.size += length 316 self.crc = zlib.crc32(data, self.crc) 317 self.offset += length 318 319 return length 320 321 def read(self, size=-1): 322 self._check_not_closed() 323 if self.mode != READ: 324 import errno 325 raise OSError(errno.EBADF, "read() on write-only GzipFile object") 326 return self._buffer.read(size) 327 328 def read1(self, size=-1): 329 """Implements BufferedIOBase.read1() 330 331 Reads up to a buffer's worth of data if size is negative.""" 332 self._check_not_closed() 333 if self.mode != READ: 334 import errno 335 raise OSError(errno.EBADF, "read1() on write-only GzipFile object") 336 337 if size < 0: 338 size = io.DEFAULT_BUFFER_SIZE 339 return self._buffer.read1(size) 340 341 def peek(self, n): 342 self._check_not_closed() 343 if self.mode != READ: 344 import errno 345 raise OSError(errno.EBADF, "peek() on write-only GzipFile object") 346 return self._buffer.peek(n) 347 348 @property 349 def closed(self): 350 return self.fileobj is None 351 352 def close(self): 353 fileobj = self.fileobj 354 if fileobj is None or self._buffer.closed: 355 return 356 try: 357 if self.mode == WRITE: 358 self._buffer.flush() 359 fileobj.write(self.compress.flush()) 360 write32u(fileobj, self.crc) 361 # self.size may exceed 2 GiB, or even 4 GiB 362 write32u(fileobj, self.size & 0xffffffff) 363 elif self.mode == READ: 364 self._buffer.close() 365 finally: 366 self.fileobj = None 367 myfileobj = self.myfileobj 368 if myfileobj: 369 self.myfileobj = None 370 myfileobj.close() 371 372 def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH): 373 self._check_not_closed() 374 if self.mode == WRITE: 375 self._buffer.flush() 376 # Ensure the compressor's buffer is flushed 377 self.fileobj.write(self.compress.flush(zlib_mode)) 378 self.fileobj.flush() 379 380 def fileno(self): 381 """Invoke the underlying file object's fileno() method. 382 383 This will raise AttributeError if the underlying file object 384 doesn't support fileno(). 385 """ 386 return self.fileobj.fileno() 387 388 def rewind(self): 389 '''Return the uncompressed stream file position indicator to the 390 beginning of the file''' 391 if self.mode != READ: 392 raise OSError("Can't rewind in write mode") 393 self._buffer.seek(0) 394 395 def readable(self): 396 return self.mode == READ 397 398 def writable(self): 399 return self.mode == WRITE 400 401 def seekable(self): 402 return True 403 404 def seek(self, offset, whence=io.SEEK_SET): 405 if self.mode == WRITE: 406 self._check_not_closed() 407 # Flush buffer to ensure validity of self.offset 408 self._buffer.flush() 409 if whence != io.SEEK_SET: 410 if whence == io.SEEK_CUR: 411 offset = self.offset + offset 412 else: 413 raise ValueError('Seek from end not supported') 414 if offset < self.offset: 415 raise OSError('Negative seek in write mode') 416 count = offset - self.offset 417 chunk = b'\0' * self._buffer_size 418 for i in range(count // self._buffer_size): 419 self.write(chunk) 420 self.write(b'\0' * (count % self._buffer_size)) 421 elif self.mode == READ: 422 self._check_not_closed() 423 return self._buffer.seek(offset, whence) 424 425 return self.offset 426 427 def readline(self, size=-1): 428 self._check_not_closed() 429 return self._buffer.readline(size) 430 431 432def _read_exact(fp, n): 433 '''Read exactly *n* bytes from `fp` 434 435 This method is required because fp may be unbuffered, 436 i.e. return short reads. 437 ''' 438 data = fp.read(n) 439 while len(data) < n: 440 b = fp.read(n - len(data)) 441 if not b: 442 raise EOFError("Compressed file ended before the " 443 "end-of-stream marker was reached") 444 data += b 445 return data 446 447 448def _read_gzip_header(fp): 449 '''Read a gzip header from `fp` and progress to the end of the header. 450 451 Returns last mtime if header was present or None otherwise. 452 ''' 453 magic = fp.read(2) 454 if magic == b'': 455 return None 456 457 if magic != b'\037\213': 458 raise BadGzipFile('Not a gzipped file (%r)' % magic) 459 460 (method, flag, last_mtime) = struct.unpack("<BBIxx", _read_exact(fp, 8)) 461 if method != 8: 462 raise BadGzipFile('Unknown compression method') 463 464 if flag & FEXTRA: 465 # Read & discard the extra field, if present 466 extra_len, = struct.unpack("<H", _read_exact(fp, 2)) 467 _read_exact(fp, extra_len) 468 if flag & FNAME: 469 # Read and discard a null-terminated string containing the filename 470 while True: 471 s = fp.read(1) 472 if not s or s==b'\000': 473 break 474 if flag & FCOMMENT: 475 # Read and discard a null-terminated string containing a comment 476 while True: 477 s = fp.read(1) 478 if not s or s==b'\000': 479 break 480 if flag & FHCRC: 481 _read_exact(fp, 2) # Read & discard the 16-bit header CRC 482 return last_mtime 483 484 485class _GzipReader(_compression.DecompressReader): 486 def __init__(self, fp): 487 super().__init__(_PaddedFile(fp), zlib._ZlibDecompressor, 488 wbits=-zlib.MAX_WBITS) 489 # Set flag indicating start of a new member 490 self._new_member = True 491 self._last_mtime = None 492 493 def _init_read(self): 494 self._crc = zlib.crc32(b"") 495 self._stream_size = 0 # Decompressed size of unconcatenated stream 496 497 def _read_gzip_header(self): 498 last_mtime = _read_gzip_header(self._fp) 499 if last_mtime is None: 500 return False 501 self._last_mtime = last_mtime 502 return True 503 504 def read(self, size=-1): 505 if size < 0: 506 return self.readall() 507 # size=0 is special because decompress(max_length=0) is not supported 508 if not size: 509 return b"" 510 511 # For certain input data, a single 512 # call to decompress() may not return 513 # any data. In this case, retry until we get some data or reach EOF. 514 while True: 515 if self._decompressor.eof: 516 # Ending case: we've come to the end of a member in the file, 517 # so finish up this member, and read a new gzip header. 518 # Check the CRC and file size, and set the flag so we read 519 # a new member 520 self._read_eof() 521 self._new_member = True 522 self._decompressor = self._decomp_factory( 523 **self._decomp_args) 524 525 if self._new_member: 526 # If the _new_member flag is set, we have to 527 # jump to the next member, if there is one. 528 self._init_read() 529 if not self._read_gzip_header(): 530 self._size = self._pos 531 return b"" 532 self._new_member = False 533 534 # Read a chunk of data from the file 535 if self._decompressor.needs_input: 536 buf = self._fp.read(READ_BUFFER_SIZE) 537 uncompress = self._decompressor.decompress(buf, size) 538 else: 539 uncompress = self._decompressor.decompress(b"", size) 540 541 if self._decompressor.unused_data != b"": 542 # Prepend the already read bytes to the fileobj so they can 543 # be seen by _read_eof() and _read_gzip_header() 544 self._fp.prepend(self._decompressor.unused_data) 545 546 if uncompress != b"": 547 break 548 if buf == b"": 549 raise EOFError("Compressed file ended before the " 550 "end-of-stream marker was reached") 551 552 self._crc = zlib.crc32(uncompress, self._crc) 553 self._stream_size += len(uncompress) 554 self._pos += len(uncompress) 555 return uncompress 556 557 def _read_eof(self): 558 # We've read to the end of the file 559 # We check that the computed CRC and size of the 560 # uncompressed data matches the stored values. Note that the size 561 # stored is the true file size mod 2**32. 562 crc32, isize = struct.unpack("<II", _read_exact(self._fp, 8)) 563 if crc32 != self._crc: 564 raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32), 565 hex(self._crc))) 566 elif isize != (self._stream_size & 0xffffffff): 567 raise BadGzipFile("Incorrect length of data produced") 568 569 # Gzip files can be padded with zeroes and still have archives. 570 # Consume all zero bytes and set the file position to the first 571 # non-zero byte. See http://www.gzip.org/#faq8 572 c = b"\x00" 573 while c == b"\x00": 574 c = self._fp.read(1) 575 if c: 576 self._fp.prepend(c) 577 578 def _rewind(self): 579 super()._rewind() 580 self._new_member = True 581 582 583def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None): 584 """Compress data in one shot and return the compressed string. 585 586 compresslevel sets the compression level in range of 0-9. 587 mtime can be used to set the modification time. The modification time is 588 set to the current time by default. 589 """ 590 # Wbits=31 automatically includes a gzip header and trailer. 591 gzip_data = zlib.compress(data, level=compresslevel, wbits=31) 592 if mtime is None: 593 mtime = time.time() 594 # Reuse gzip header created by zlib, replace mtime and OS byte for 595 # consistency. 596 header = struct.pack("<4sLBB", gzip_data, int(mtime), gzip_data[8], 255) 597 return header + gzip_data[10:] 598 599 600def decompress(data): 601 """Decompress a gzip compressed string in one shot. 602 Return the decompressed string. 603 """ 604 decompressed_members = [] 605 while True: 606 fp = io.BytesIO(data) 607 if _read_gzip_header(fp) is None: 608 return b"".join(decompressed_members) 609 # Use a zlib raw deflate compressor 610 do = zlib.decompressobj(wbits=-zlib.MAX_WBITS) 611 # Read all the data except the header 612 decompressed = do.decompress(data[fp.tell():]) 613 if not do.eof or len(do.unused_data) < 8: 614 raise EOFError("Compressed file ended before the end-of-stream " 615 "marker was reached") 616 crc, length = struct.unpack("<II", do.unused_data[:8]) 617 if crc != zlib.crc32(decompressed): 618 raise BadGzipFile("CRC check failed") 619 if length != (len(decompressed) & 0xffffffff): 620 raise BadGzipFile("Incorrect length of data produced") 621 decompressed_members.append(decompressed) 622 data = do.unused_data[8:].lstrip(b"\x00") 623 624 625def main(): 626 from argparse import ArgumentParser 627 parser = ArgumentParser(description= 628 "A simple command line interface for the gzip module: act like gzip, " 629 "but do not delete the input file.") 630 group = parser.add_mutually_exclusive_group() 631 group.add_argument('--fast', action='store_true', help='compress faster') 632 group.add_argument('--best', action='store_true', help='compress better') 633 group.add_argument("-d", "--decompress", action="store_true", 634 help="act like gunzip instead of gzip") 635 636 parser.add_argument("args", nargs="*", default=["-"], metavar='file') 637 args = parser.parse_args() 638 639 compresslevel = _COMPRESS_LEVEL_TRADEOFF 640 if args.fast: 641 compresslevel = _COMPRESS_LEVEL_FAST 642 elif args.best: 643 compresslevel = _COMPRESS_LEVEL_BEST 644 645 for arg in args.args: 646 if args.decompress: 647 if arg == "-": 648 f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer) 649 g = sys.stdout.buffer 650 else: 651 if arg[-3:] != ".gz": 652 sys.exit(f"filename doesn't end in .gz: {arg!r}") 653 f = open(arg, "rb") 654 g = builtins.open(arg[:-3], "wb") 655 else: 656 if arg == "-": 657 f = sys.stdin.buffer 658 g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer, 659 compresslevel=compresslevel) 660 else: 661 f = builtins.open(arg, "rb") 662 g = open(arg + ".gz", "wb") 663 while True: 664 chunk = f.read(READ_BUFFER_SIZE) 665 if not chunk: 666 break 667 g.write(chunk) 668 if g is not sys.stdout.buffer: 669 g.close() 670 if f is not sys.stdin.buffer: 671 f.close() 672 673if __name__ == '__main__': 674 main() 675