# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011, 2012, 2013, 2014, 2015, 2016, 2017 Python Software Foundation; All
# Rights Reserved
#
# This is a backport from python 3.4 into python 2.7. Text and exclusive mode
# support are removed as they're unsupported in 2.7. This backport patches a
# streaming bug that exists in python 2.7.

"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct
import sys
import time
import os
import zlib
import io

# Python 2/3 compatibility shims (replacing the former ``six`` dependency
# with behavior-identical stdlib equivalents).
try:  # Python 3
    import builtins
except ImportError:  # Python 2
    import __builtin__ as builtins

try:  # Python 2: filenames may be str or unicode
    _string_types = (str, unicode)  # noqa: F821
except NameError:  # Python 3
    _string_types = (str,)

try:  # Python 2: use lazy xrange for the (potentially huge) seek loops
    _range = xrange  # noqa: F821
except NameError:  # Python 3
    _range = range

__all__ = ["GzipFile", "open", "compress", "decompress"]

# Gzip header flag bits (RFC 1952, FLG field).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2


def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)


def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))


class _PaddedFile(object):
    """Minimal read-only file object that prepends a string to the contents
    of an actual file. Shouldn't be used outside of gzip.py, as it lacks
    essential functionality."""

    def __init__(self, f, prepend=b''):
        self._buffer = prepend
        self._length = len(prepend)
        self.file = f
        # Position within self._buffer; None once the buffer is exhausted
        # and reads pass straight through to the underlying file.
        self._read = 0

    def read(self, size):
        if self._read is None:
            return self.file.read(size)
        if self._read + size <= self._length:
            read = self._read
            self._read += size
            return self._buffer[read:self._read]
        else:
            # Drain the rest of the buffer, then fall through to the file.
            read = self._read
            self._read = None
            return self._buffer[read:] + \
                self.file.read(size - self._length + read)

    def prepend(self, prepend=b'', readprevious=False):
        if self._read is None:
            self._buffer = prepend
        elif readprevious and len(prepend) <= self._read:
            # The prepended data was already served from this buffer:
            # just rewind over it instead of copying.
            self._read -= len(prepend)
            return
        else:
            self._buffer = self._buffer[self._read:] + prepend
        self._length = len(self._buffer)
        self._read = 0

    def unused(self):
        if self._read is None:
            return b''
        return self._buffer[self._read:]

    def seek(self, offset, whence=0):
        # This is only ever called with offset=whence=0
        if whence == 1 and self._read is not None:
            if 0 <= offset + self._read <= self._length:
                self._read += offset
                return
            else:
                offset += self._length - self._read
        self._read = None
        self._buffer = None
        return self.file.seek(offset, whence)

    def __getattr__(self, name):
        # Delegate everything else to the wrapped file object.
        return getattr(self.file, name)


class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    This class only supports opening files in binary mode. If you need to open
    a compressed file in text mode, use the gzip.open() function.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024  # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, an io.BytesIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file. It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written. The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        A mode of 'r' is equivalent to one of 'rb', and similarly for 'w' and
        'wb', and 'a' and 'ab'.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing. All gzip compressed streams
        are required to contain a timestamp. If omitted or None, the
        current time is used. This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it. The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        if mode and ('t' in mode or 'U' in mode):
            raise ValueError("Invalid mode: {!r}".format(mode))
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')
        if filename is None:
            filename = getattr(fileobj, 'name', '')
            if not isinstance(filename, _string_types):
                filename = ''
        if mode is None:
            mode = getattr(fileobj, 'mode', 'rb')

        if mode.startswith('r'):
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100
            fileobj = _PaddedFile(fileobj)

        elif mode.startswith(('w', 'a')):
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        fileobj = self.fileobj
        if isinstance(fileobj, _PaddedFile):
            fileobj = fileobj.file
        s = repr(fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        # Single flag byte; struct.pack is 2/3-portable (bytes on both).
        self.fileobj.write(struct.pack("B", flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # XFL: max compression
        self.fileobj.write(b'\377')                 # OS: unknown
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        data = self.fileobj.read(n)
        while len(data) < n:
            b = self.fileobj.read(n - len(data))
            if not b:
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")
            data += b
        return data

    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic == b'':
            return False

        if magic != b'\037\213':
            raise OSError('Not a gzipped file')

        method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
        if method != 8:
            raise OSError('Unknown compression method')

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            extra_len, = struct.unpack("<H", self._read_exact(2))
            self._read_exact(extra_len)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

        unused = self.fileobj.unused()
        if unused:
            uncompress = self.decompress.decompress(unused)
            self._add_read_data(uncompress)
        return True

    def write(self, data):
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise OSError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        self._check_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        readsize = 1024
        if size < 0:        # get the whole thing
            while self._read(readsize):
                readsize = min(self.max_read_chunk, readsize * 2)
            size = self.extrasize
        else:               # just get some more of it
            while size > self.extrasize:
                if not self._read(readsize):
                    if size > self.extrasize:
                        size = self.extrasize
                    break
                readsize = min(self.max_read_chunk, readsize * 2)

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def read1(self, size=-1):
        self._check_closed()
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "read1() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        # For certain input data, a single call to _read() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while self.extrasize <= 0 and self._read():
            pass
        if size < 0 or size > self.extrasize:
            size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize -= size
        self.offset += size
        return chunk

    def peek(self, n):
        if self.mode != READ:
            import errno
            raise OSError(errno.EBADF, "peek() on write-only GzipFile object")

        # Do not return ridiculously small buffers, for one common idiom
        # is to call peek(1) and expect more bytes in return.
        if n < 100:
            n = 100
        if self.extrasize == 0:
            if self.fileobj is None:
                return b''
            # Ensure that we don't return b"" if we haven't reached EOF.
            # 1024 is the same buffering heuristic used in read()
            while self.extrasize == 0 and self._read(max(n, 1024)):
                pass
        offset = self.offset - self.extrastart
        remaining = self.extrasize
        assert remaining == len(self.extrabuf) - offset
        return self.extrabuf[offset:offset + n]

    def _unread(self, buf):
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            return False

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                return False
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof()
            self.fileobj.prepend(self.decompress.unused_data, True)
            self._read_eof()
            self._add_read_data(uncompress)
            return False

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # Prepend the already read bytes to the fileobj so they can be
            # seen by _read_eof() and _read_gzip_header()
            self.fileobj.prepend(self.decompress.unused_data, True)
            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True
        return True

    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        crc32, isize = struct.unpack("<II", self._read_exact(8))
        if crc32 != self.crc:
            raise OSError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise OSError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.prepend(c, True)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise OSError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise OSError('Negative seek in write mode')
            count = offset - self.offset
            # NOTE: pad with literal zero bytes.  The previous code used
            # ``bytes(1024)``, which under Python 2 (where bytes is str)
            # produces the 4-character string "1024" rather than 1024
            # NUL bytes; ``b'\0' * n`` is correct on both 2 and 3.
            chunk = b'\0' * 1024
            for i in _range(count // 1024):
                self.write(chunk)
            self.write(b'\0' * (count % 1024))
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in _range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs)    # Return resulting line


def compress(data, compresslevel=9):
    """Compress data in one shot and return the compressed string.
    Optional argument is the compression level, in range of 0-9.
    """
    buf = io.BytesIO()
    with GzipFile(fileobj=buf, mode='wb', compresslevel=compresslevel) as f:
        f.write(data)
    return buf.getvalue()


def decompress(data):
    """Decompress a gzip compressed string in one shot.
    Return the decompressed string.
    """
    with GzipFile(fileobj=io.BytesIO(data)) as f:
        return f.read()