"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import io
import os
import struct
import sys
import time
import zlib

try:
    import _compression
except ImportError:
    # NOTE(review): newer CPython releases appear to ship this private helper
    # as compression._common._streams -- confirm against the target version.
    from compression._common import _streams as _compression

__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks for the FLG byte of a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal values stored in GzipFile.mode.
READ, WRITE = 1, 2

_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def write32u(output, value):
    """Write *value* to *output* as a little-endian 32-bit integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))


class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Offset into self._buffer of the next byte to hand out; None once
        # the buffer is exhausted and reads go straight to the file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, serving prepended data first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # The request runs past the buffer: return what is left of it
            # followed by the remainder read from the underlying file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b''):
        """Push *prepend* back so the next read() returns it first."""
        if self._read is None:
            self._buffer = prepend
        else:  # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._length = len(self._buffer)
        self._read = 0

    def seek(self, off):
        """Drop any prepended data and seek the underlying file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams


class BadGzipFile(OSError):
    """Exception raised in some cases for invalid gzip files."""


class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, (str, bytes)):
                filename = ''
        else:
            filename = os.fspath(filename)
        # Remember whether the caller gave a mode, for the warning below.
        origmode = mode
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            raw = _GzipReader(fileobj)
            self._buffer = io.BufferedReader(raw)
            self.name = filename

        elif mode.startswith(('w', 'a', 'x')):
            if origmode is None:
                import warnings
                warnings.warn(
                    "GzipFile was opened for writing, but this will "
                    "change in future Python releases.  "
                    "Specify the mode argument for opening it for writing.",
                    FutureWarning, 2)
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits selects a raw deflate stream; the gzip header
            # and trailer are written by this class.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
            self._write_mtime = mtime
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj

        if self.mode == WRITE:
            self._write_gzip_header(compresslevel)

    @property
    def filename(self):
        """Deprecated alias of ``name``; appends ".gz" in write mode."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-stream state used while writing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # XFL byte: 2 for the best level, 4 for the fastest, 0 otherwise.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the uncompressed length.

        Raises OSError(EBADF) if the file was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes.

        Raises OSError(EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek(n).

        Raises OSError(EBADF) if the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release resources; safe to call repeatedly.

        In write mode the compressor is flushed and the CRC32/size trailer is
        written.  The underlying file is closed only if this object opened it.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  In read mode the call is delegated to the
        internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for _ in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line via the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)


class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes a sequence of gzip members from a file."""

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the running CRC and size for a new member."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Consume one member header; return False at end of input.

        Raises BadGzipFile on a wrong magic number or compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, or b"" at end of input.

        Raises EOFError if the input ends in the middle of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        self._new_member = True


def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    Optional argument is the compression level, in range of 0-9.  The
    keyword-only mtime is passed to GzipFile for the header timestamp.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel, mtime=mtime) as f:
        f.write(data)
    return buf.getvalue()


def decompress(data):
    """Decompress gzip compressed bytes in one shot.

    Return the decompressed bytes.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()


def main():
    """Command line entry point: act like gzip, keeping the input file."""
    from argparse import ArgumentParser
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                        help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        if args.decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer,
                             compresslevel=compresslevel)
            else:
                f = builtins.open(arg, "rb")
                # Pass the selected level so --fast/--best also apply to
                # named files, not only to stdin.
                g = open(arg + ".gz", "wb", compresslevel=compresslevel)
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            # Close both ends even if the copy fails; never close the
            # process's own standard streams.
            try:
                if g is not sys.stdout.buffer:
                    g.close()
            finally:
                if f is not sys.stdin.buffer:
                    f.close()


if __name__ == '__main__':
    main()