"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import io
import os
import struct
import sys
import time
import zlib

try:
    import _compression
except ImportError:
    # Newer Python releases (3.14+) relocated this private helper module;
    # fall back to its new home so the file keeps importing there.
    from compression._common import _streams as _compression

__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks for the FLG byte of a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode.
READ, WRITE = 1, 2

_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError for a mode containing both "t" and "b", or when a
    text-only argument is given in binary mode, and TypeError when filename
    is neither a path nor an object with a read() or write() attribute.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        encoding = io.text_encoding(encoding)
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))


class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Read position inside self._buffer; None once the buffer has been
        # exhausted and reads go straight to the underlying file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, serving the prepended buffer first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # The request runs past the buffer: return what is left of it
            # plus the remainder from the underlying file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is None:
            self._buffer = prepend
        else:  # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._length = len(self._buffer)
        self._read = 0

    def seek(self, off):
        """Discard any prepended data and seek the underlying file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams


class BadGzipFile(OSError):
    """Exception raised in some cases for invalid gzip files."""


class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        # Remember whether the caller specified a mode explicitly; writing
        # with an inferred mode triggers a FutureWarning below.
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases. "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def filename(self):
        """Deprecated alias for ``name`` (with ".gz" appended in write mode)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None.

        Always None for a file opened for writing, since no header is read.
        """
        if self.mode != READ:
            # Write-mode objects have no reader; report None rather than
            # leaking an AttributeError about the private _buffer attribute.
            return None
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Initialise the bookkeeping attributes used in write mode."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to self.fileobj.

        The XFL byte is derived from *compresslevel*; the FNAME field is
        included only when self.name yields a non-empty Latin-1 name.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the uncompressed length.

        *data* may be bytes, bytearray, or any object supporting the buffer
        protocol.  Raises OSError(EBADF) if the file is not open for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes.

        Raises OSError(EBADF) if the file is not open for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek(n).

        Raises OSError(EBADF) if the file is not open for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release resources; safe to call twice.

        In write mode the compressor is flushed and the CRC-32/size trailer
        is written.  The underlying file is closed only if this object
        opened it itself (i.e. it was created from a filename).
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor (using *zlib_mode*) and the
        underlying file object.  Does nothing in read mode."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  In read mode the seek is delegated to the
        internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line of uncompressed data via the buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)


class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes a (possibly multi-member) gzip stream.

    Builds on _compression.DecompressReader, adding gzip header parsing and
    CRC-32/length verification of each member's trailer.
    """

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the per-member CRC and size counters."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse one member header; return False at a clean end of input.

        Stores the header's MTIME in self._last_mtime.  Raises BadGzipFile
        for a wrong magic number or an unknown compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, b"" at end of stream.

        A negative *size* reads everything.  Raises EOFError if the input
        ends before the current member's end-of-stream marker.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data(uncompress)
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC-32 and member size."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises BadGzipFile if the stored CRC-32 or length does not match
        the data that was decompressed.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        self._new_member = True


def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed string.
    Optional argument is the compression level, in range of 0-9.
    The keyword-only *mtime* is passed to GzipFile for the header timestamp.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
        f.write(data)
    return buf.getvalue()


def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()


def main():
    """Command line entry point: compress or decompress the given files.

    Each argument is processed in turn ("-" means stdin/stdout); input files
    are never deleted.  --fast/--best select the compression level for both
    stdin and named files.
    """
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    sys.exit(f"filename doesn't end in .gz: {arg!r}")
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
                             compresslevel=compresslevel)
            else:
                f = builtins.open(arg, "rb")
                # Pass the selected level so --fast/--best also apply to
                # named files, not only to stdin.
                g = open(arg + ".gz", "wb", compresslevel=compresslevel)
        try:
            while True:
                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even if the copy fails part-way; the standard
            # streams themselves are left open.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()


if __name__ == '__main__':
    main()