"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import builtins
import io
import os
import struct
import sys
import time
import zlib

import _compression

__all__ = ["GzipFile", "open", "compress", "decompress"]

# Bit values of the FLG byte in a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode.
READ, WRITE = 1, 2

# Compression levels that map onto a dedicated XFL header value.
_COMPRESS_LEVEL_FAST = 1
_COMPRESS_LEVEL_BEST = 9


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a gzip-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str or bytes object), or
    an existing file object to read from or write to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or "ab" for
    binary mode, or "rt", "wt", "xt" or "at" for text mode. The default mode is
    "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the GzipFile constructor:
    GzipFile(filename, mode, compresslevel). In this case, the encoding, errors
    and newline arguments must not be provided.

    For text mode, a GzipFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error handling
    behavior, and line ending(s).

    Raises ValueError for a mode containing both "t" and "b", or when a
    text-only argument is given in binary mode, and TypeError when filename
    is neither a path nor an object with a read or write attribute.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # GzipFile itself only understands binary modes.
    gz_mode = mode.replace("t", "")
    if isinstance(filename, (str, bytes, os.PathLike)):
        binary_file = GzipFile(filename, gz_mode, compresslevel)
    elif hasattr(filename, "read") or hasattr(filename, "write"):
        binary_file = GzipFile(None, gz_mode, compresslevel, filename)
    else:
        raise TypeError("filename must be a str or bytes object, or a file")

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def write32u(output, value):
    """Write *value* to *output* as a little-endian 32-bit integer."""
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))


class _PaddedFile:
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Offset of the next unread byte in _buffer, or None once the
        # buffer has been exhausted and reads go straight to the file.
        self._read = 0

    def read(self, size):
        """Return up to *size* bytes, serving the prepended data first."""
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # The request runs past the buffer: hand out what is left of it
            # and take the remainder from the underlying file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                   self.file.read(size-self._length+read)

    def prepend(self, prepend=b''):
        """Push *prepend* back so that it is returned by the next read()."""
        if self._read is None:
            self._buffer = prepend
        else:  # Assume data was read since the last prepend() call
            self._read -= len(prepend)
            return
        self._length = len(self._buffer)
        self._read = 0

    def seek(self, off):
        """Drop any prepended data and seek the underlying file to *off*."""
        self._read = None
        self._buffer = None
        return self.file.seek(off)

    def seekable(self):
        return True  # Allows fast-forwarding even in unseekable streams


class GzipFile(_compression.BaseStream):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the truncate() method.

    This class only supports opening files in binary mode. If you need to open a
    compressed file in text mode, use the gzip.open() function.

    """

    # Overridden with internal file object to be closed, if only a filename
    # is passed in
    myfileobj = None

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', 'wb', 'x', or
        'xb' depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', 'a' and 'ab', and 'x' and 'xb'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the last modification time field in the stream when compressing.
        If omitted or None, the current time is used.

        If construction fails after this constructor opened the file itself,
        that file is closed before the exception propagates.

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        try:
            if filename is None:
                filename = getattr(fileobj, 'name', '')
                if not isinstance(filename, (str, bytes)):
                    filename = ''
            else:
                filename = os.fspath(filename)
            if mode is None:
                mode = getattr(fileobj, 'mode', 'rb')

            if mode.startswith('r'):
                self.mode = READ
                raw = _GzipReader(fileobj)
                self._buffer = io.BufferedReader(raw)
                self.name = filename

            elif mode.startswith(('w', 'a', 'x')):
                self.mode = WRITE
                self._init_write(filename)
                # Negative wbits: raw deflate stream, the gzip framing is
                # written by this class.
                self.compress = zlib.compressobj(compresslevel,
                                                 zlib.DEFLATED,
                                                 -zlib.MAX_WBITS,
                                                 zlib.DEF_MEM_LEVEL,
                                                 0)
                self._write_mtime = mtime
            else:
                raise ValueError("Invalid mode: {!r}".format(mode))

            self.fileobj = fileobj

            if self.mode == WRITE:
                self._write_gzip_header(compresslevel)
        except BaseException:
            # Only a file opened by this constructor is ours to clean up; a
            # caller-supplied fileobj is left untouched.
            owned = self.myfileobj
            if owned is not None:
                self.myfileobj = None
                # Mark the object closed so a later close() does not try to
                # write a trailer to the file closed here.
                self.fileobj = None
                owned.close()
            raise

    @property
    def filename(self):
        """Deprecated alias for name (with ".gz" appended in write mode)."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    @property
    def mtime(self):
        """Last modification time read from stream, or None"""
        return self._buffer.raw._last_mtime

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the bookkeeping used while writing a member."""
        self.name = filename
        self.crc = zlib.crc32(b"")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0
        self.offset = 0  # Current file offset for seek(), tell(), etc

    def _write_gzip_header(self, compresslevel=_COMPRESS_LEVEL_BEST):
        """Write the gzip member header to self.fileobj.

        compresslevel selects the XFL byte: 2 for the best level, 4 for the
        fastest level and 0 otherwise.  The default reproduces the value 2
        for callers that pass no argument.
        """
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags).encode('latin-1'))
        mtime = self._write_mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        if compresslevel == _COMPRESS_LEVEL_BEST:
            xfl = b'\002'
        elif compresslevel == _COMPRESS_LEVEL_FAST:
            xfl = b'\004'
        else:
            xfl = b'\000'
        self.fileobj.write(xfl)
        self.fileobj.write(b'\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + b'\000')

    def write(self, data):
        """Compress *data* and write it out; return the uncompressed length.

        Raises OSError(EBADF) when the file was not opened for writing.
        """
        self._check_not_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        if isinstance(data, bytes):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        if length > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += length
            self.crc = zlib.crc32(data, self.crc)
            self.offset += length

        return length

    def read(self, size=-1):
        """Read from the internal buffered reader.

        Raises OSError(EBADF) when the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Implements BufferedIOBase.read1()

        Reads up to a buffer's worth of data if size is negative."""
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def peek(self, n):
        """Delegate to the internal buffered reader's peek().

        Raises OSError(EBADF) when the file was not opened for reading.
        """
        self._check_not_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")
        return self._buffer.peek(n)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the file object.

        In write mode the remaining compressed data, the CRC and the size
        are written first.  A file opened by the constructor is closed; a
        caller-supplied fileobj is not.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
            elif self.mode == READ:
                self._buffer.close()
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        """In write mode, flush the compressor and the underlying file."""
        self._check_not_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self._buffer.seek(0)

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the position in the uncompressed stream.

        In write mode only forward seeks are possible; the gap is filled by
        writing zero bytes.  Seeking from the end raises ValueError and a
        backward seek raises OSError.  In read mode the call is delegated
        to the internal buffered reader.
        """
        if self.mode == WRITE:
            if whence != io.SEEK_SET:
                if whence == io.SEEK_CUR:
                    offset = self.offset + offset
                else:
                    raise ValueError('Seek from end not supported')
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            chunk = b'\0' * 1024
            for i in range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            self._check_not_closed()
            return self._buffer.seek(offset, whence)

        return self.offset

    def readline(self, size=-1):
        """Read a line through the internal buffered reader."""
        self._check_not_closed()
        return self._buffer.readline(size)


class _GzipReader(_compression.DecompressReader):
    """Raw reader that decodes one or more concatenated gzip members."""

    def __init__(self, fp):
        super().__init__(_PaddedFile(fp), zlib.decompressobj,
                         wbits=-zlib.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def _init_read(self):
        """Reset the per-member CRC and size counters."""
        self._crc = zlib.crc32(b"")
        self._stream_size = 0  # Decompressed size of unconcatenated stream

    def _read_exact(self, n):
        '''Read exactly *n* bytes from `self._fp`

        This method is required because self._fp may be unbuffered,
        i.e. return short reads.
        '''

        data = self._fp.read(n)
        while len(data) < n:
            b = self._fp.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        """Consume a member header; return False if the input is exhausted.

        Raises OSError when the magic number or compression method is not
        the expected one.
        """
        magic = self._fp.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file (%r)' % magic)

        (method, flag,
         self._last_mtime) = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self._fp.read(1)
                if not s or s==b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC
        return True

    def read(self, size=-1):
        """Return up to *size* decompressed bytes, or b"" at end of input.

        A negative size reads everything via readall().  Raises EOFError
        when the input ends in the middle of a member.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            buf = self._fp.read(io.DEFAULT_BUFFER_SIZE)

            uncompress = self._decompressor.decompress(buf, size)
            if self._decompressor.unconsumed_tail != b"":
                self._fp.prepend(self._decompressor.unconsumed_tail)
            elif self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        self._add_read_data( uncompress )
        self._pos += len(uncompress)
        return uncompress

    def _add_read_data(self, data):
        """Fold *data* into the running CRC and size of the current member."""
        self._crc = zlib.crc32(data, self._crc)
        self._stream_size = self._stream_size + len(data)

    def _read_eof(self):
        """Verify the member trailer and skip any zero padding after it.

        Raises OSError when the stored CRC or size does not match.
        """
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self._crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self._crc)))
        elif isize != (self._stream_size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self._fp.read(1)
        if c:
            self._fp.prepend(c)

    def _rewind(self):
        super()._rewind()
        self._new_member = True


def compress(data, compresslevel=9, *, mtime=None):
    """Compress data in one shot and return the compressed string.

    compresslevel is the compression level, in range of 0-9.  mtime is the
    timestamp written to the header; None means the current time, so pass
    a fixed value to get reproducible output.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel,
                  mtime=mtime) as f:
        f.write(data)
    return buf.getvalue()


def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    # Named so that it does not shadow the module-level decompress().
    decompress_mode = args and args[0] == "-d"
    if decompress_mode:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress_mode:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin.buffer)
                g = sys.stdout.buffer
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz:", repr(arg))
                    continue
                f = open(arg, "rb")
                g = builtins.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin.buffer
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout.buffer)
            else:
                f = builtins.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Close both ends even if the copy fails part-way; the standard
        # streams themselves are never closed.
        try:
            try:
                while True:
                    chunk = f.read(1024)
                    if not chunk:
                        break
                    g.write(chunk)
            finally:
                if g is not sys.stdout.buffer:
                    g.close()
        finally:
            if f is not sys.stdin.buffer:
                f.close()


if __name__ == '__main__':
    _test()