"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import io
import os
import struct
import sys
import time
import zlib

try:
    import _compression
except ImportError:
    # NOTE(review): newer CPython releases appear to have relocated this
    # private helper module; the fallback is only used when the original
    # import is unavailable.
    from compression._common import _streams as _compression

__all__ = ["BadGzipFile", "GzipFile", "open", "compress", "decompress"]

# Bit masks for the FLG byte of a gzip member header (RFC 1952).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal integer modes stored in GzipFile.mode.
READ, WRITE = 1, 2

_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_TRADEOFF = 6
_COMPRESS_LEVEL_BEST = 9


def open(filename, mode="rb", compresslevel=_COMPRESS_LEVEL_BEST,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes or
    os.PathLike object), or an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError for a mode containing both "t" and "b", or when a
    text-only argument is passed in binary mode, and TypeError when
    filename is neither a path nor an object with read() or write().

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def write32u(output, value):
    """Write *value* to *output* as a little-endian unsigned 32-bit integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))


class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Offset of the next unread byte in self._buffer, or None once the
        # buffer is exhausted and reads go straight to the wrapped file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, draining the prepended buffer first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            # The request is satisfied entirely from the buffer.
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Hand out the rest of the buffer and top up from the file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is None:
            self._buffer = prepend
        else:  # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._length = len(self._buffer)
        self._read = 0

    def seek(self, off):
        """Discard any prepended data and seek the wrapped file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams


class BadGzipFile(OSError):
    """Exception raised in some cases for invalid gzip files."""


class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=_COMPRESS_LEVEL_BEST, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.  If fileobj exposes a
        non-string mode (for example another GzipFile, whose mode is the
        integer READ or WRITE), WRITE selects 'wb' and anything else 'rb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        Raises ValueError for a text or otherwise invalid mode.  If
        construction fails after this constructor opened filename itself,
        that file is closed again before the exception propagates.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        try:
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    filename = ''
            else:
                filename = os.fspath(filename)
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')
                if not isinstance(mode, str):
                    # Some file objects (e.g. a wrapped GzipFile) expose a
                    # non-string mode; str methods would fail on it below.
                    mode = 'wb' if mode == WRITE else 'rb'

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                self.mode = WRITE
                self._init_write(filename)
                # Negative wbits: raw deflate stream, the gzip header and
                # trailer are written by this class.
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Do not leak a file this constructor opened itself, and leave
            # the half-built object in the "closed" state so a later
            # close() is a no-op.  A caller-supplied fileobj is left alone.
            self.fileobj = None
            opened = self.myfileobj
            if opened is not None:
                self.myfileobj = None
                opened.close()
            raise

    @property
    def filename(self):
        """Deprecated alias of name (with ".gz" appended in write mode)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel):
        """Write the gzip member header to self.fileobj."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(bytes([flags]))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        # XFL byte: records whether best or fastest compression was used.
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the uncompressed length.

        *data* may be bytes or any object supporting the buffer protocol.
        Raises OSError(EBADF) when the file was opened for reading.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read and return up to *size* uncompressed bytes.

        Raises OSError(EBADF) when the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek(n).

        Raises OSError(EBADF) when the file was opened for writing.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        """True once close() has detached the underlying file object."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written.  The underlying file is only closed when it
        was opened by this object.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2 GiB, or even 4 GiB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled
        with zero bytes.  Seeking from the end is not supported in write
        mode (ValueError) and backward seeks raise OSError.  In read mode
        the call is delegated to the internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read one line via the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)


class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes a sequence of gzip members from a file."""

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the per-member CRC and size accumulators."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Parse a member header; return False at a clean end of file.

        Raises BadGzipFile on a wrong magic number or compression method.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise BadGzipFile('Not a gzipped file (%r)' % magic)

        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise BadGzipFile('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, b"" at end of file.

        Raises EOFError when the input ends before the end-of-stream
        marker, and BadGzipFile when a header or trailer is invalid.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data(uncompress)
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it."""
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
                                                             hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise BadGzipFile("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        self._new_member = True


def compress(data, compresslevel=_COMPRESS_LEVEL_BEST, *, mtime=None):
    """Compress data in one shot and return the compressed bytes.

    Optional argument is the compression level, in range of 0-9.
    The keyword-only mtime is passed through to GzipFile.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel,
                  mtime=mtime) as f:
        f.write(data)
    return buf.getvalue()


def decompress(data):
    """Decompress a gzip compressed bytes object in one shot.

    Return the decompressed bytes.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()


def main():
    """Command line interface: act like gzip/gunzip but keep the input."""
    from argparse import ArgumentParser
    from contextlib import ExitStack
    parser = ArgumentParser(description=
        "A simple command line interface for the gzip module: act like gzip, "
        "but do not delete the input file.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--fast', action='store_true', help='compress faster')
    group.add_argument('--best', action='store_true', help='compress better')
    group.add_argument("-d", "--decompress", action="store_true",
                       help="act like gunzip instead of gzip")

    parser.add_argument("args", nargs="*", default=["-"], metavar='file')
    args = parser.parse_args()

    compresslevel = _COMPRESS_LEVEL_TRADEOFF
    if args.fast:
        compresslevel = _COMPRESS_LEVEL_FAST
    elif args.best:
        compresslevel = _COMPRESS_LEVEL_BEST

    for arg in args.args:
        # Everything entered on the stack is closed when the block is left,
        # even if opening the second file or the copy itself fails.  The
        # standard streams are never entered, so they stay open.
        with ExitStack() as stack:
            if args.decompress:
                if arg == "-":
                    f = stack.enter_context(
                        GzipFile(filename="", mode="rb",
                                 fileobj=sys.stdin.buffer))
                    g = sys.stdout.buffer
                else:
                    if arg[-3:] != ".gz":
                        print("filename doesn't end in .gz:", repr(arg))
                        continue
                    f = stack.enter_context(open(arg, "rb"))
                    g = stack.enter_context(builtins.open(arg[:-3], "wb"))
            else:
                if arg == "-":
                    f = sys.stdin.buffer
                    g = stack.enter_context(
                        GzipFile(filename="", mode="wb",
                                 fileobj=sys.stdout.buffer,
                                 compresslevel=compresslevel))
                else:
                    f = stack.enter_context(builtins.open(arg, "rb"))
                    g = stack.enter_context(open(arg + ".gz", "wb"))
            while True:
                chunk = f.read(io.DEFAULT_BUFFER_SIZE)
                if not chunk:
                    break
                g.write(chunk)


if __name__ == '__main__':
    main()