"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct
import sys
import time
import os
import zlib
import io
try:
    import __builtin__
except ImportError:
    # Python 3 renamed the module.  Keep the historical name bound so the
    # rest of this file reaches the real built-in open() the same way on
    # both major versions (this module defines its own open()).
    import builtins as __builtin__

__all__ = ["GzipFile","open"]

# Bit values of the FLG byte in a gzip member header.
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal values stored in GzipFile.mode.
READ, WRITE = 1, 2

# Lazy integer range on Python 2 (xrange) and Python 3 (range).
_range = getattr(__builtin__, "xrange", range)

# The text string type: unicode on Python 2, str on Python 3.
_TEXT_TYPE = type(b"".decode("ascii"))


def write32u(output, value):
    """Write *value* to *output* as a little-endian unsigned 32-bit integer."""
    output.write(struct.pack("<L", value))


def read32(input):
    """Read four bytes from *input* and return them as a little-endian
    unsigned 32-bit integer."""
    return struct.unpack("<I", input.read(4))[0]


def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)


class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by __init__ itself (as opposed to one passed in by
    # the caller); close() only closes the underlying file in that case.
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            # A non-string name (e.g. an integer file descriptor) cannot be
            # used as a header filename either.
            filename = getattr(fileobj, 'name', '')
            if (filename == '<fdopen>'
                    or not isinstance(filename, (bytes, _TEXT_TYPE))):
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = b""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            # Do not leak a file that was opened above, and leave the
            # object in the "closed" state so close() is a no-op.
            self.fileobj = None
            if self.myfileobj is not None:
                self.myfileobj.close()
                self.myfileobj = None
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        """Deprecated alias of ``name``; in write mode ".gz" is appended
        when the name does not already end with it."""
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        """Reset the per-stream state used while compressing."""
        self.name = filename
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write(b'\037\213')             # magic header
        self.fileobj.write(b'\010')                 # compression method
        try:
            # RFC 1952 requires the FNAME field to be Latin-1. Do not
            # include filenames that cannot be represented that way.
            fname = os.path.basename(self.name)
            if not isinstance(fname, bytes):
                fname = fname.encode('latin-1')
            if fname.endswith(b'.gz'):
                fname = fname[:-3]
        except UnicodeEncodeError:
            fname = b''
        flags = 0
        if fname:
            flags = FNAME
        # struct.pack yields a one-byte string on both Python 2 and 3.
        self.fileobj.write(struct.pack("<B", flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write(b'\002')                 # extra flags (XFL)
        self.fileobj.write(b'\377')                 # OS field
        if fname:
            self.fileobj.write(fname + b'\000')

    def _init_read(self):
        """Reset the CRC and size accumulators for a new gzip member."""
        self.crc = zlib.crc32(b"") & 0xffffffff
        self.size = 0

    def _read_exact(self, n):
        """Read exactly *n* bytes from the underlying file object.

        Raises IOError if the file ends first.  EOFError is deliberately
        not used: read() treats EOFError as the normal end of data and
        would silently hide the truncation.
        """
        data = self.fileobj.read(n)
        while len(data) < n:
            more = self.fileobj.read(n - len(data))
            if not more:
                raise IOError("Truncated gzip header")
            data += more
        return data

    def _read_gzip_header(self):
        """Parse and validate a gzip member header.

        Stores the header timestamp in self.mtime and discards the optional
        extra, filename, comment and header-CRC fields.

        Raises IOError if the magic number or compression method is wrong,
        or if the fixed-size part of the header is truncated.
        """
        magic = self.fileobj.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self._read_exact(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self._read_exact(1))
        self.mtime = struct.unpack("<I", self._read_exact(4))[0]
        # Skip the extra-flags and OS bytes; neither is used.
        self._read_exact(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = struct.unpack("<H", self._read_exact(2))[0]
            self._read_exact(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == b'\000':
                    break
        if flag & FHCRC:
            self._read_exact(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress *data* and write it to the underlying file object.

        Returns the number of uncompressed bytes consumed.  Raises
        ValueError on a closed file and IOError (EBADF) when the object
        was opened for reading.
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.fileobj.write(self.compress.compress(data))
            self.size += len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Return up to *size* uncompressed bytes, or everything that is
        left when *size* is negative.

        Raises ValueError on a closed file and IOError (EBADF) when the
        object was opened for writing.
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return b''

        # _read() signals the end of the compressed data with EOFError;
        # the chunk size doubles on every call up to max_read_chunk.
        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push *buf* back: it must be the tail of the data most recently
        returned by read(), which is still held in extrabuf."""
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read *size* compressed bytes and append their decompressed form
        to the internal buffer.

        Raises EOFError when there is no further data to read.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)  # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == b"":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != b"":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed *data* to the buffer, dropping the part of
        the buffer that has already been consumed, and update the running
        CRC and size."""
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC-32 and length stored in the member trailer.

        Raises IOError when either value does not match the data that was
        decompressed.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = b"\x00"
        while c == b"\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        """True once close() has been called."""
        return self.fileobj is None

    def close(self):
        """Finish the stream and release the underlying file.

        In write mode the remaining compressed data and the CRC/size
        trailer are written first.  The underlying file object is closed
        only when it was opened by this object.  Calling close() again is
        a no-op.
        """
        fileobj = self.fileobj
        if fileobj is None:
            return
        self.fileobj = None
        try:
            if self.mode == WRITE:
                fileobj.write(self.compress.flush())
                write32u(fileobj, self.crc)
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, self.size & 0xffffffff)
        finally:
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush the compressor (write mode only) using *zlib_mode*, then
        flush the underlying file object."""
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = b""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        """Move to uncompressed position *offset* and return the new
        position.

        whence may be 0 (absolute) or 1 (relative); any other value raises
        ValueError.  In write mode only forward seeks are possible and the
        gap is filled with zero bytes; a backward seek raises IOError.  In
        read mode a backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            zeros = 1024 * b'\0'
            for i in _range(count // 1024):
                self.write(zeros)
            self.write((count % 1024) * b'\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in _range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        """Read and return one line, including its trailing newline if any.

        A non-negative *size* limits the number of bytes returned.
        """
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find(b'\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            # No limit requested: use the largest practical value.
            size = sys.maxsize
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find(b'\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == b'':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return b''.join(bufs)  # Return resulting line


def _test():
    """Command-line driver: compress each argument, or decompress with -d."""
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.

    # On Python 3 the standard streams are text wrappers; the binary
    # stream is their .buffer attribute.  Python 2 has no such attribute.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    stdout = getattr(sys.stdout, "buffer", sys.stdout)

    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=stdin)
                g = stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: %r" % (arg,))
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = stdin
                g = GzipFile(filename="", mode="wb", fileobj=stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Close both ends even when the copy fails part-way.
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            try:
                if g is not stdout:
                    g.close()
            finally:
                if f is not stdin:
                    f.close()

if __name__ == '__main__':
    _test()