"""Functions that read and write gzipped files.

The user of the file doesn't have to worry about the compression,
but random access is not allowed."""

# based on Andrew Kuchling's minigzip.py distributed with the zlib module

import struct, sys, time, os
import zlib
import io
try:
    import __builtin__          # Python 2
except ImportError:             # allow import (not full use) on Python 3
    import builtins as __builtin__

__all__ = ["GzipFile", "open"]

# Gzip member header flag bits (RFC 1952, section 2.3.1).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Internal mode tags for GzipFile.mode.
READ, WRITE = 1, 2


def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))


def read32(input):
    # Read a little-endian unsigned 32-bit value.
    # NOTE(review): on a truncated stream input.read(4) returns fewer than
    # 4 bytes and struct.unpack raises struct.error, not EOFError — callers
    # of _read_eof() see that exception for corrupt trailers.
    return struct.unpack("<I", input.read(4))[0]


def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)


class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None                    # set when we opened the file ourselves
    max_read_chunk = 10 * 1024 * 1024   # 10Mb cap for one _read() chunk

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing.  All gzip compressed streams
        are required to contain a timestamp.  If omitted or None, the
        current time is used.  This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it.  The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Raw deflate stream (negative wbits): we write our own
            # gzip header and trailer around it.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        self.offset = 0         # current position in the uncompressed stream
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

    @property
    def filename(self):
        # Deprecated alias for the name attribute; appends '.gz' when
        # writing, mirroring the historical behavior of this attribute.
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        # Reset the per-member bookkeeping used while compressing.
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        # Emit the 10-byte gzip member header (RFC 1952), plus the
        # optional original-filename field when we know a name.
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method: deflate
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, int(mtime))
        self.fileobj.write('\002')                 # XFL: max compression
        self.fileobj.write('\377')                 # OS: unknown
        if fname:
            self.fileobj.write(fname + '\000')     # null-terminated name

    def _init_read(self):
        # Reset CRC/size accumulators at the start of each member.
        self.crc = zlib.crc32("") & 0xffffffff
        self.size = 0

    def _read_gzip_header(self):
        # Parse and validate one member header, skipping optional fields.
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord(self.fileobj.read(1))
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord(self.fileobj.read(1))
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256 * ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s == '\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self, data):
        """Compress data and write it to the underlying file object.

        Returns the number of (uncompressed) bytes consumed.  Raises
        IOError(EBADF) on a read-mode object and ValueError after close().
        """
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffff
            self.fileobj.write(self.compress.compress(data))
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        """Read up to size uncompressed bytes (all remaining if size < 0).

        Raises IOError(EBADF) on a write-mode object and ValueError
        after close().
        """
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        # Push data back into the internal buffer (used by readline).
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        # Decompress one chunk of size compressed bytes into the internal
        # buffer; raises EOFError when the stream is exhausted.
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()     # Save current position
            self.fileobj.seek(0, 2)       # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek(pos)    # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data(uncompress)
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data(uncompress)

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek(-len(self.decompress.unused_data) + 8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        # Account for newly decompressed data: update the running CRC,
        # append to the buffer, and advance the size counters.
        self.crc = zlib.crc32(data, self.crc) & 0xffffffff
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)     # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffff):
            raise IOError("Incorrect length of data produced")

        # Gzip files can be padded with zeroes and still have archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte.  See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        # Flush the compressor and write the 8-byte trailer (CRC32 and
        # ISIZE) in write mode; then detach from the underlying file,
        # closing it only if we opened it ourselves.
        if self.fileobj is None:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32u(self.fileobj, self.crc)
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, self.size & 0xffffffff)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self, zlib_mode=zlib.Z_SYNC_FLUSH):
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        # Only absolute (0) and relative (1) seeks are supported; seeking
        # from the end would require decompressing the whole stream.
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            # Forward seek in write mode is emulated by writing zeroes.
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

    def readline(self, size=-1):
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])     # Add portion of last chunk
                self._unread(c[i + 1:])    # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs)     # Return resulting line


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print("filename doesn't end in .gz: %r" % arg)
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()


if __name__ == '__main__':
    _test()