#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
"""Read from and write to tar format archives.
"""

version = "0.9.0"
__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__ = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__ = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
37 38#--------- 39# Imports 40#--------- 41from builtins import open as bltn_open 42import sys 43import os 44import io 45import shutil 46import stat 47import time 48import struct 49import copy 50import re 51 52try: 53 import pwd 54except ImportError: 55 pwd = None 56try: 57 import grp 58except ImportError: 59 grp = None 60 61# os.symlink on Windows prior to 6.0 raises NotImplementedError 62symlink_exception = (AttributeError, NotImplementedError) 63try: 64 # OSError (winerror=1314) will be raised if the caller does not hold the 65 # SeCreateSymbolicLinkPrivilege privilege 66 symlink_exception += (OSError,) 67except NameError: 68 pass 69 70# from tarfile import * 71__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", 72 "CompressionError", "StreamError", "ExtractError", "HeaderError", 73 "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", 74 "DEFAULT_FORMAT", "open"] 75 76#--------------------------------------------------------- 77# tar constants 78#--------------------------------------------------------- 79NUL = b"\0" # the null character 80BLOCKSIZE = 512 # length of processing blocks 81RECORDSIZE = BLOCKSIZE * 20 # length of records 82GNU_MAGIC = b"ustar \0" # magic gnu tar string 83POSIX_MAGIC = b"ustar\x0000" # magic posix tar string 84 85LENGTH_NAME = 100 # maximum length of a filename 86LENGTH_LINK = 100 # maximum length of a linkname 87LENGTH_PREFIX = 155 # maximum length of the prefix field 88 89REGTYPE = b"0" # regular file 90AREGTYPE = b"\0" # regular file 91LNKTYPE = b"1" # link (inside tarfile) 92SYMTYPE = b"2" # symbolic link 93CHRTYPE = b"3" # character special device 94BLKTYPE = b"4" # block special device 95DIRTYPE = b"5" # directory 96FIFOTYPE = b"6" # fifo special device 97CONTTYPE = b"7" # contiguous file 98 99GNUTYPE_LONGNAME = b"L" # GNU tar longname 100GNUTYPE_LONGLINK = b"K" # GNU tar longlink 101GNUTYPE_SPARSE = b"S" # GNU tar sparse file 102 103XHDTYPE = b"x" # POSIX.1-2001 extended header 104XGLTYPE = b"g" # 
POSIX.1-2001 global header 105SOLARIS_XHDTYPE = b"X" # Solaris extended header 106 107USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format 108GNU_FORMAT = 1 # GNU tar format 109PAX_FORMAT = 2 # POSIX.1-2001 (pax) format 110DEFAULT_FORMAT = GNU_FORMAT 111 112#--------------------------------------------------------- 113# tarfile constants 114#--------------------------------------------------------- 115# File types that tarfile supports: 116SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, 117 SYMTYPE, DIRTYPE, FIFOTYPE, 118 CONTTYPE, CHRTYPE, BLKTYPE, 119 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 120 GNUTYPE_SPARSE) 121 122# File types that will be treated as a regular file. 123REGULAR_TYPES = (REGTYPE, AREGTYPE, 124 CONTTYPE, GNUTYPE_SPARSE) 125 126# File types that are part of the GNU tar format. 127GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 128 GNUTYPE_SPARSE) 129 130# Fields from a pax header that override a TarInfo attribute. 131PAX_FIELDS = ("path", "linkpath", "size", "mtime", 132 "uid", "gid", "uname", "gname") 133 134# Fields from a pax header that are affected by hdrcharset. 135PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"} 136 137# Fields in a pax header that are numbers, all other fields 138# are treated as strings. 139PAX_NUMBER_FIELDS = { 140 "atime": float, 141 "ctime": float, 142 "mtime": float, 143 "uid": int, 144 "gid": int, 145 "size": int 146} 147 148#--------------------------------------------------------- 149# initialization 150#--------------------------------------------------------- 151if os.name == "nt": 152 ENCODING = "utf-8" 153else: 154 ENCODING = sys.getfilesystemencoding() 155 156#--------------------------------------------------------- 157# Some useful functions 158#--------------------------------------------------------- 159 160def stn(s, length, encoding, errors): 161 """Convert a string to a null-terminated bytes object. 
162 """ 163 s = s.encode(encoding, errors) 164 return s[:length] + (length - len(s)) * NUL 165 166def nts(s, encoding, errors): 167 """Convert a null-terminated bytes object to a string. 168 """ 169 p = s.find(b"\0") 170 if p != -1: 171 s = s[:p] 172 return s.decode(encoding, errors) 173 174def nti(s): 175 """Convert a number field to a python number. 176 """ 177 # There are two possible encodings for a number field, see 178 # itn() below. 179 if s[0] in (0o200, 0o377): 180 n = 0 181 for i in range(len(s) - 1): 182 n <<= 8 183 n += s[i + 1] 184 if s[0] == 0o377: 185 n = -(256 ** (len(s) - 1) - n) 186 else: 187 try: 188 s = nts(s, "ascii", "strict") 189 n = int(s.strip() or "0", 8) 190 except ValueError: 191 raise InvalidHeaderError("invalid header") 192 return n 193 194def itn(n, digits=8, format=DEFAULT_FORMAT): 195 """Convert a python number to a number field. 196 """ 197 # POSIX 1003.1-1988 requires numbers to be encoded as a string of 198 # octal digits followed by a null-byte, this allows values up to 199 # (8**(digits-1))-1. GNU tar allows storing numbers greater than 200 # that if necessary. A leading 0o200 or 0o377 byte indicate this 201 # particular encoding, the following digits-1 bytes are a big-endian 202 # base-256 representation. This allows values up to (256**(digits-1))-1. 203 # A 0o200 byte indicates a positive number, a 0o377 byte a negative 204 # number. 
205 if 0 <= n < 8 ** (digits - 1): 206 s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL 207 elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1): 208 if n >= 0: 209 s = bytearray([0o200]) 210 else: 211 s = bytearray([0o377]) 212 n = 256 ** digits + n 213 214 for i in range(digits - 1): 215 s.insert(1, n & 0o377) 216 n >>= 8 217 else: 218 raise ValueError("overflow in number field") 219 220 return s 221 222def calc_chksums(buf): 223 """Calculate the checksum for a member's header by summing up all 224 characters except for the chksum field which is treated as if 225 it was filled with spaces. According to the GNU tar sources, 226 some tars (Sun and NeXT) calculate chksum with signed char, 227 which will be different if there are chars in the buffer with 228 the high bit set. So we calculate two checksums, unsigned and 229 signed. 230 """ 231 unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf)) 232 signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf)) 233 return unsigned_chksum, signed_chksum 234 235def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None): 236 """Copy length bytes from fileobj src to fileobj dst. 237 If length is None, copy the entire content. 
238 """ 239 bufsize = bufsize or 16 * 1024 240 if length == 0: 241 return 242 if length is None: 243 shutil.copyfileobj(src, dst, bufsize) 244 return 245 246 blocks, remainder = divmod(length, bufsize) 247 for b in range(blocks): 248 buf = src.read(bufsize) 249 if len(buf) < bufsize: 250 raise exception("unexpected end of data") 251 dst.write(buf) 252 253 if remainder != 0: 254 buf = src.read(remainder) 255 if len(buf) < remainder: 256 raise exception("unexpected end of data") 257 dst.write(buf) 258 return 259 260def filemode(mode): 261 """Deprecated in this location; use stat.filemode.""" 262 import warnings 263 warnings.warn("deprecated in favor of stat.filemode", 264 DeprecationWarning, 2) 265 return stat.filemode(mode) 266 267def _safe_print(s): 268 encoding = getattr(sys.stdout, 'encoding', None) 269 if encoding is not None: 270 s = s.encode(encoding, 'backslashreplace').decode(encoding) 271 print(s, end=' ') 272 273 274class TarError(Exception): 275 """Base exception.""" 276 pass 277class ExtractError(TarError): 278 """General exception for extract errors.""" 279 pass 280class ReadError(TarError): 281 """Exception for unreadable tar archives.""" 282 pass 283class CompressionError(TarError): 284 """Exception for unavailable compression methods.""" 285 pass 286class StreamError(TarError): 287 """Exception for unsupported operations on stream-like TarFiles.""" 288 pass 289class HeaderError(TarError): 290 """Base exception for header errors.""" 291 pass 292class EmptyHeaderError(HeaderError): 293 """Exception for empty headers.""" 294 pass 295class TruncatedHeaderError(HeaderError): 296 """Exception for truncated headers.""" 297 pass 298class EOFHeaderError(HeaderError): 299 """Exception for end of file headers.""" 300 pass 301class InvalidHeaderError(HeaderError): 302 """Exception for invalid headers.""" 303 pass 304class SubsequentHeaderError(HeaderError): 305 """Exception for missing and invalid extended headers.""" 306 pass 307 308#--------------------------- 
309# internal stream interface 310#--------------------------- 311class _LowLevelFile: 312 """Low-level file object. Supports reading and writing. 313 It is used instead of a regular file object for streaming 314 access. 315 """ 316 317 def __init__(self, name, mode): 318 mode = { 319 "r": os.O_RDONLY, 320 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 321 }[mode] 322 if hasattr(os, "O_BINARY"): 323 mode |= os.O_BINARY 324 self.fd = os.open(name, mode, 0o666) 325 326 def close(self): 327 os.close(self.fd) 328 329 def read(self, size): 330 return os.read(self.fd, size) 331 332 def write(self, s): 333 os.write(self.fd, s) 334 335class _Stream: 336 """Class that serves as an adapter between TarFile and 337 a stream-like object. The stream-like object only 338 needs to have a read() or write() method and is accessed 339 blockwise. Use of gzip or bzip2 compression is possible. 340 A stream-like object could be for example: sys.stdin, 341 sys.stdout, a socket, a tape device etc. 342 343 _Stream is intended to be used only internally. 344 """ 345 346 def __init__(self, name, mode, comptype, fileobj, bufsize): 347 """Construct a _Stream object. 
348 """ 349 self._extfileobj = True 350 if fileobj is None: 351 fileobj = _LowLevelFile(name, mode) 352 self._extfileobj = False 353 354 if comptype == '*': 355 # Enable transparent compression detection for the 356 # stream interface 357 fileobj = _StreamProxy(fileobj) 358 comptype = fileobj.getcomptype() 359 360 self.name = name or "" 361 self.mode = mode 362 self.comptype = comptype 363 self.fileobj = fileobj 364 self.bufsize = bufsize 365 self.buf = b"" 366 self.pos = 0 367 self.closed = False 368 369 try: 370 if comptype == "gz": 371 try: 372 import zlib 373 except ImportError: 374 raise CompressionError("zlib module is not available") 375 self.zlib = zlib 376 self.crc = zlib.crc32(b"") 377 if mode == "r": 378 self._init_read_gz() 379 self.exception = zlib.error 380 else: 381 self._init_write_gz() 382 383 elif comptype == "bz2": 384 try: 385 import bz2 386 except ImportError: 387 raise CompressionError("bz2 module is not available") 388 if mode == "r": 389 self.dbuf = b"" 390 self.cmp = bz2.BZ2Decompressor() 391 self.exception = OSError 392 else: 393 self.cmp = bz2.BZ2Compressor() 394 395 elif comptype == "xz": 396 try: 397 import lzma 398 except ImportError: 399 raise CompressionError("lzma module is not available") 400 if mode == "r": 401 self.dbuf = b"" 402 self.cmp = lzma.LZMADecompressor() 403 self.exception = lzma.LZMAError 404 else: 405 self.cmp = lzma.LZMACompressor() 406 407 elif comptype != "tar": 408 raise CompressionError("unknown compression type %r" % comptype) 409 410 except: 411 if not self._extfileobj: 412 self.fileobj.close() 413 self.closed = True 414 raise 415 416 def __del__(self): 417 if hasattr(self, "closed") and not self.closed: 418 self.close() 419 420 def _init_write_gz(self): 421 """Initialize for writing with gzip compression. 
422 """ 423 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED, 424 -self.zlib.MAX_WBITS, 425 self.zlib.DEF_MEM_LEVEL, 426 0) 427 timestamp = struct.pack("<L", int(time.time())) 428 self.__write(b"\037\213\010\010" + timestamp + b"\002\377") 429 if self.name.endswith(".gz"): 430 self.name = self.name[:-3] 431 # RFC1952 says we must use ISO-8859-1 for the FNAME field. 432 self.__write(self.name.encode("iso-8859-1", "replace") + NUL) 433 434 def write(self, s): 435 """Write string s to the stream. 436 """ 437 if self.comptype == "gz": 438 self.crc = self.zlib.crc32(s, self.crc) 439 self.pos += len(s) 440 if self.comptype != "tar": 441 s = self.cmp.compress(s) 442 self.__write(s) 443 444 def __write(self, s): 445 """Write string s to the stream if a whole new block 446 is ready to be written. 447 """ 448 self.buf += s 449 while len(self.buf) > self.bufsize: 450 self.fileobj.write(self.buf[:self.bufsize]) 451 self.buf = self.buf[self.bufsize:] 452 453 def close(self): 454 """Close the _Stream object. No operation should be 455 done on it afterwards. 456 """ 457 if self.closed: 458 return 459 460 self.closed = True 461 try: 462 if self.mode == "w" and self.comptype != "tar": 463 self.buf += self.cmp.flush() 464 465 if self.mode == "w" and self.buf: 466 self.fileobj.write(self.buf) 467 self.buf = b"" 468 if self.comptype == "gz": 469 self.fileobj.write(struct.pack("<L", self.crc)) 470 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF)) 471 finally: 472 if not self._extfileobj: 473 self.fileobj.close() 474 475 def _init_read_gz(self): 476 """Initialize for reading a gzip compressed fileobj. 
477 """ 478 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) 479 self.dbuf = b"" 480 481 # taken from gzip.GzipFile with some alterations 482 if self.__read(2) != b"\037\213": 483 raise ReadError("not a gzip file") 484 if self.__read(1) != b"\010": 485 raise CompressionError("unsupported compression method") 486 487 flag = ord(self.__read(1)) 488 self.__read(6) 489 490 if flag & 4: 491 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1)) 492 self.read(xlen) 493 if flag & 8: 494 while True: 495 s = self.__read(1) 496 if not s or s == NUL: 497 break 498 if flag & 16: 499 while True: 500 s = self.__read(1) 501 if not s or s == NUL: 502 break 503 if flag & 2: 504 self.__read(2) 505 506 def tell(self): 507 """Return the stream's file pointer position. 508 """ 509 return self.pos 510 511 def seek(self, pos=0): 512 """Set the stream's file pointer to pos. Negative seeking 513 is forbidden. 514 """ 515 if pos - self.pos >= 0: 516 blocks, remainder = divmod(pos - self.pos, self.bufsize) 517 for i in range(blocks): 518 self.read(self.bufsize) 519 self.read(remainder) 520 else: 521 raise StreamError("seeking backwards is not allowed") 522 return self.pos 523 524 def read(self, size=None): 525 """Return the next size number of bytes from the stream. 526 If size is not defined, return all bytes of the stream 527 up to EOF. 528 """ 529 if size is None: 530 t = [] 531 while True: 532 buf = self._read(self.bufsize) 533 if not buf: 534 break 535 t.append(buf) 536 buf = "".join(t) 537 else: 538 buf = self._read(size) 539 self.pos += len(buf) 540 return buf 541 542 def _read(self, size): 543 """Return size bytes from the stream. 
544 """ 545 if self.comptype == "tar": 546 return self.__read(size) 547 548 c = len(self.dbuf) 549 while c < size: 550 buf = self.__read(self.bufsize) 551 if not buf: 552 break 553 try: 554 buf = self.cmp.decompress(buf) 555 except self.exception: 556 raise ReadError("invalid compressed data") 557 self.dbuf += buf 558 c += len(buf) 559 buf = self.dbuf[:size] 560 self.dbuf = self.dbuf[size:] 561 return buf 562 563 def __read(self, size): 564 """Return size bytes from stream. If internal buffer is empty, 565 read another block from the stream. 566 """ 567 c = len(self.buf) 568 while c < size: 569 buf = self.fileobj.read(self.bufsize) 570 if not buf: 571 break 572 self.buf += buf 573 c += len(buf) 574 buf = self.buf[:size] 575 self.buf = self.buf[size:] 576 return buf 577# class _Stream 578 579class _StreamProxy(object): 580 """Small proxy class that enables transparent compression 581 detection for the Stream interface (mode 'r|*'). 582 """ 583 584 def __init__(self, fileobj): 585 self.fileobj = fileobj 586 self.buf = self.fileobj.read(BLOCKSIZE) 587 588 def read(self, size): 589 self.read = self.fileobj.read 590 return self.buf 591 592 def getcomptype(self): 593 if self.buf.startswith(b"\x1f\x8b\x08"): 594 return "gz" 595 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY": 596 return "bz2" 597 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")): 598 return "xz" 599 else: 600 return "tar" 601 602 def close(self): 603 self.fileobj.close() 604# class StreamProxy 605 606#------------------------ 607# Extraction file object 608#------------------------ 609class _FileInFile(object): 610 """A thin wrapper around an existing file object that 611 provides a part of its data as an individual file 612 object. 
613 """ 614 615 def __init__(self, fileobj, offset, size, blockinfo=None): 616 self.fileobj = fileobj 617 self.offset = offset 618 self.size = size 619 self.position = 0 620 self.name = getattr(fileobj, "name", None) 621 self.closed = False 622 623 if blockinfo is None: 624 blockinfo = [(0, size)] 625 626 # Construct a map with data and zero blocks. 627 self.map_index = 0 628 self.map = [] 629 lastpos = 0 630 realpos = self.offset 631 for offset, size in blockinfo: 632 if offset > lastpos: 633 self.map.append((False, lastpos, offset, None)) 634 self.map.append((True, offset, offset + size, realpos)) 635 realpos += size 636 lastpos = offset + size 637 if lastpos < self.size: 638 self.map.append((False, lastpos, self.size, None)) 639 640 def flush(self): 641 pass 642 643 def readable(self): 644 return True 645 646 def writable(self): 647 return False 648 649 def seekable(self): 650 return self.fileobj.seekable() 651 652 def tell(self): 653 """Return the current file position. 654 """ 655 return self.position 656 657 def seek(self, position, whence=io.SEEK_SET): 658 """Seek to a position in the file. 659 """ 660 if whence == io.SEEK_SET: 661 self.position = min(max(position, 0), self.size) 662 elif whence == io.SEEK_CUR: 663 if position < 0: 664 self.position = max(self.position + position, 0) 665 else: 666 self.position = min(self.position + position, self.size) 667 elif whence == io.SEEK_END: 668 self.position = max(min(self.size + position, self.size), 0) 669 else: 670 raise ValueError("Invalid argument") 671 return self.position 672 673 def read(self, size=None): 674 """Read data from the file. 
675 """ 676 if size is None: 677 size = self.size - self.position 678 else: 679 size = min(size, self.size - self.position) 680 681 buf = b"" 682 while size > 0: 683 while True: 684 data, start, stop, offset = self.map[self.map_index] 685 if start <= self.position < stop: 686 break 687 else: 688 self.map_index += 1 689 if self.map_index == len(self.map): 690 self.map_index = 0 691 length = min(size, stop - self.position) 692 if data: 693 self.fileobj.seek(offset + (self.position - start)) 694 b = self.fileobj.read(length) 695 if len(b) != length: 696 raise ReadError("unexpected end of data") 697 buf += b 698 else: 699 buf += NUL * length 700 size -= length 701 self.position += length 702 return buf 703 704 def readinto(self, b): 705 buf = self.read(len(b)) 706 b[:len(buf)] = buf 707 return len(buf) 708 709 def close(self): 710 self.closed = True 711#class _FileInFile 712 713class ExFileObject(io.BufferedReader): 714 715 def __init__(self, tarfile, tarinfo): 716 fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data, 717 tarinfo.size, tarinfo.sparse) 718 super().__init__(fileobj) 719#class ExFileObject 720 721#------------------ 722# Exported Classes 723#------------------ 724class TarInfo(object): 725 """Informational class which holds the details about an 726 archive member given by a tar header block. 727 TarInfo objects are returned by TarFile.getmember(), 728 TarFile.getmembers() and TarFile.gettarinfo() and are 729 usually created internally. 730 """ 731 732 __slots__ = ("name", "mode", "uid", "gid", "size", "mtime", 733 "chksum", "type", "linkname", "uname", "gname", 734 "devmajor", "devminor", 735 "offset", "offset_data", "pax_headers", "sparse", 736 "tarfile", "_sparse_structs", "_link_target") 737 738 def __init__(self, name=""): 739 """Construct a TarInfo object. name is the optional name 740 of the member. 
741 """ 742 self.name = name # member name 743 self.mode = 0o644 # file permissions 744 self.uid = 0 # user id 745 self.gid = 0 # group id 746 self.size = 0 # file size 747 self.mtime = 0 # modification time 748 self.chksum = 0 # header checksum 749 self.type = REGTYPE # member type 750 self.linkname = "" # link name 751 self.uname = "" # user name 752 self.gname = "" # group name 753 self.devmajor = 0 # device major number 754 self.devminor = 0 # device minor number 755 756 self.offset = 0 # the tar header starts here 757 self.offset_data = 0 # the file's data starts here 758 759 self.sparse = None # sparse member information 760 self.pax_headers = {} # pax header information 761 762 # In pax headers the "name" and "linkname" field are called 763 # "path" and "linkpath". 764 def _getpath(self): 765 return self.name 766 def _setpath(self, name): 767 self.name = name 768 path = property(_getpath, _setpath) 769 770 def _getlinkpath(self): 771 return self.linkname 772 def _setlinkpath(self, linkname): 773 self.linkname = linkname 774 linkpath = property(_getlinkpath, _setlinkpath) 775 776 def __repr__(self): 777 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) 778 779 def get_info(self): 780 """Return the TarInfo's attributes as a dictionary. 781 """ 782 info = { 783 "name": self.name, 784 "mode": self.mode & 0o7777, 785 "uid": self.uid, 786 "gid": self.gid, 787 "size": self.size, 788 "mtime": self.mtime, 789 "chksum": self.chksum, 790 "type": self.type, 791 "linkname": self.linkname, 792 "uname": self.uname, 793 "gname": self.gname, 794 "devmajor": self.devmajor, 795 "devminor": self.devminor 796 } 797 798 if info["type"] == DIRTYPE and not info["name"].endswith("/"): 799 info["name"] += "/" 800 801 return info 802 803 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"): 804 """Return a tar header as a string of 512 byte blocks. 
805 """ 806 info = self.get_info() 807 808 if format == USTAR_FORMAT: 809 return self.create_ustar_header(info, encoding, errors) 810 elif format == GNU_FORMAT: 811 return self.create_gnu_header(info, encoding, errors) 812 elif format == PAX_FORMAT: 813 return self.create_pax_header(info, encoding) 814 else: 815 raise ValueError("invalid format") 816 817 def create_ustar_header(self, info, encoding, errors): 818 """Return the object as a ustar header block. 819 """ 820 info["magic"] = POSIX_MAGIC 821 822 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 823 raise ValueError("linkname is too long") 824 825 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 826 info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors) 827 828 return self._create_header(info, USTAR_FORMAT, encoding, errors) 829 830 def create_gnu_header(self, info, encoding, errors): 831 """Return the object as a GNU header block sequence. 832 """ 833 info["magic"] = GNU_MAGIC 834 835 buf = b"" 836 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 837 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors) 838 839 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 840 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors) 841 842 return buf + self._create_header(info, GNU_FORMAT, encoding, errors) 843 844 def create_pax_header(self, info, encoding): 845 """Return the object as a ustar header block. If it cannot be 846 represented this way, prepend a pax extended header sequence 847 with supplement information. 848 """ 849 info["magic"] = POSIX_MAGIC 850 pax_headers = self.pax_headers.copy() 851 852 # Test string fields for values that exceed the field length or cannot 853 # be represented in ASCII encoding. 
854 for name, hname, length in ( 855 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), 856 ("uname", "uname", 32), ("gname", "gname", 32)): 857 858 if hname in pax_headers: 859 # The pax header has priority. 860 continue 861 862 # Try to encode the string as ASCII. 863 try: 864 info[name].encode("ascii", "strict") 865 except UnicodeEncodeError: 866 pax_headers[hname] = info[name] 867 continue 868 869 if len(info[name]) > length: 870 pax_headers[hname] = info[name] 871 872 # Test number fields for values that exceed the field limit or values 873 # that like to be stored as float. 874 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): 875 if name in pax_headers: 876 # The pax header has priority. Avoid overflow. 877 info[name] = 0 878 continue 879 880 val = info[name] 881 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): 882 pax_headers[name] = str(val) 883 info[name] = 0 884 885 # Create a pax extended header if necessary. 886 if pax_headers: 887 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding) 888 else: 889 buf = b"" 890 891 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace") 892 893 @classmethod 894 def create_pax_global_header(cls, pax_headers): 895 """Return the object as a pax global header block sequence. 896 """ 897 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8") 898 899 def _posix_split_name(self, name, encoding, errors): 900 """Split a name longer than 100 chars into a prefix 901 and a name part. 
902 """ 903 components = name.split("/") 904 for i in range(1, len(components)): 905 prefix = "/".join(components[:i]) 906 name = "/".join(components[i:]) 907 if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \ 908 len(name.encode(encoding, errors)) <= LENGTH_NAME: 909 break 910 else: 911 raise ValueError("name is too long") 912 913 return prefix, name 914 915 @staticmethod 916 def _create_header(info, format, encoding, errors): 917 """Return a header block. info is a dictionary with file 918 information, format must be one of the *_FORMAT constants. 919 """ 920 parts = [ 921 stn(info.get("name", ""), 100, encoding, errors), 922 itn(info.get("mode", 0) & 0o7777, 8, format), 923 itn(info.get("uid", 0), 8, format), 924 itn(info.get("gid", 0), 8, format), 925 itn(info.get("size", 0), 12, format), 926 itn(info.get("mtime", 0), 12, format), 927 b" ", # checksum field 928 info.get("type", REGTYPE), 929 stn(info.get("linkname", ""), 100, encoding, errors), 930 info.get("magic", POSIX_MAGIC), 931 stn(info.get("uname", ""), 32, encoding, errors), 932 stn(info.get("gname", ""), 32, encoding, errors), 933 itn(info.get("devmajor", 0), 8, format), 934 itn(info.get("devminor", 0), 8, format), 935 stn(info.get("prefix", ""), 155, encoding, errors) 936 ] 937 938 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts)) 939 chksum = calc_chksums(buf[-BLOCKSIZE:])[0] 940 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:] 941 return buf 942 943 @staticmethod 944 def _create_payload(payload): 945 """Return the string payload filled with zero bytes 946 up to the next 512 byte border. 947 """ 948 blocks, remainder = divmod(len(payload), BLOCKSIZE) 949 if remainder > 0: 950 payload += (BLOCKSIZE - remainder) * NUL 951 return payload 952 953 @classmethod 954 def _create_gnu_long_header(cls, name, type, encoding, errors): 955 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence 956 for name. 
957 """ 958 name = name.encode(encoding, errors) + NUL 959 960 info = {} 961 info["name"] = "././@LongLink" 962 info["type"] = type 963 info["size"] = len(name) 964 info["magic"] = GNU_MAGIC 965 966 # create extended header + name blocks. 967 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \ 968 cls._create_payload(name) 969 970 @classmethod 971 def _create_pax_generic_header(cls, pax_headers, type, encoding): 972 """Return a POSIX.1-2008 extended or global header sequence 973 that contains a list of keyword, value pairs. The values 974 must be strings. 975 """ 976 # Check if one of the fields contains surrogate characters and thereby 977 # forces hdrcharset=BINARY, see _proc_pax() for more information. 978 binary = False 979 for keyword, value in pax_headers.items(): 980 try: 981 value.encode("utf-8", "strict") 982 except UnicodeEncodeError: 983 binary = True 984 break 985 986 records = b"" 987 if binary: 988 # Put the hdrcharset field at the beginning of the header. 989 records += b"21 hdrcharset=BINARY\n" 990 991 for keyword, value in pax_headers.items(): 992 keyword = keyword.encode("utf-8") 993 if binary: 994 # Try to restore the original byte representation of `value'. 995 # Needless to say, that the encoding must match the string. 996 value = value.encode(encoding, "surrogateescape") 997 else: 998 value = value.encode("utf-8") 999 1000 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' 1001 n = p = 0 1002 while True: 1003 n = l + len(str(p)) 1004 if n == p: 1005 break 1006 p = n 1007 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n" 1008 1009 # We use a hardcoded "././@PaxHeader" name like star does 1010 # instead of the one that POSIX recommends. 1011 info = {} 1012 info["name"] = "././@PaxHeader" 1013 info["type"] = type 1014 info["size"] = len(records) 1015 info["magic"] = POSIX_MAGIC 1016 1017 # Create pax header + record blocks. 
1018 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \ 1019 cls._create_payload(records) 1020 1021 @classmethod 1022 def frombuf(cls, buf, encoding, errors): 1023 """Construct a TarInfo object from a 512 byte bytes object. 1024 """ 1025 if len(buf) == 0: 1026 raise EmptyHeaderError("empty header") 1027 if len(buf) != BLOCKSIZE: 1028 raise TruncatedHeaderError("truncated header") 1029 if buf.count(NUL) == BLOCKSIZE: 1030 raise EOFHeaderError("end of file header") 1031 1032 chksum = nti(buf[148:156]) 1033 if chksum not in calc_chksums(buf): 1034 raise InvalidHeaderError("bad checksum") 1035 1036 obj = cls() 1037 obj.name = nts(buf[0:100], encoding, errors) 1038 obj.mode = nti(buf[100:108]) 1039 obj.uid = nti(buf[108:116]) 1040 obj.gid = nti(buf[116:124]) 1041 obj.size = nti(buf[124:136]) 1042 obj.mtime = nti(buf[136:148]) 1043 obj.chksum = chksum 1044 obj.type = buf[156:157] 1045 obj.linkname = nts(buf[157:257], encoding, errors) 1046 obj.uname = nts(buf[265:297], encoding, errors) 1047 obj.gname = nts(buf[297:329], encoding, errors) 1048 obj.devmajor = nti(buf[329:337]) 1049 obj.devminor = nti(buf[337:345]) 1050 prefix = nts(buf[345:500], encoding, errors) 1051 1052 # Old V7 tar format represents a directory as a regular 1053 # file with a trailing slash. 1054 if obj.type == AREGTYPE and obj.name.endswith("/"): 1055 obj.type = DIRTYPE 1056 1057 # The old GNU sparse format occupies some of the unused 1058 # space in the buffer for up to 4 sparse structures. 1059 # Save the them for later processing in _proc_sparse(). 
        if obj.type == GNUTYPE_SPARSE:
            # Up to 4 (offset, numbytes) pairs of 12 bytes each, starting
            # at offset 386 of the header block.
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            # Byte 482 flags that more sparse headers follow; 483..494 hold
            # the real (unsparsified) file size.
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following
    # operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
1165 while isextended: 1166 buf = tarfile.fileobj.read(BLOCKSIZE) 1167 pos = 0 1168 for i in range(21): 1169 try: 1170 offset = nti(buf[pos:pos + 12]) 1171 numbytes = nti(buf[pos + 12:pos + 24]) 1172 except ValueError: 1173 break 1174 if offset and numbytes: 1175 structs.append((offset, numbytes)) 1176 pos += 24 1177 isextended = bool(buf[504]) 1178 self.sparse = structs 1179 1180 self.offset_data = tarfile.fileobj.tell() 1181 tarfile.offset = self.offset_data + self._block(self.size) 1182 self.size = origsize 1183 return self 1184 1185 def _proc_pax(self, tarfile): 1186 """Process an extended or global header as described in 1187 POSIX.1-2008. 1188 """ 1189 # Read the header information. 1190 buf = tarfile.fileobj.read(self._block(self.size)) 1191 1192 # A pax header stores supplemental information for either 1193 # the following file (extended) or all following files 1194 # (global). 1195 if self.type == XGLTYPE: 1196 pax_headers = tarfile.pax_headers 1197 else: 1198 pax_headers = tarfile.pax_headers.copy() 1199 1200 # Check if the pax header contains a hdrcharset field. This tells us 1201 # the encoding of the path, linkpath, uname and gname fields. Normally, 1202 # these fields are UTF-8 encoded but since POSIX.1-2008 tar 1203 # implementations are allowed to store them as raw binary strings if 1204 # the translation to UTF-8 fails. 1205 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf) 1206 if match is not None: 1207 pax_headers["hdrcharset"] = match.group(1).decode("utf-8") 1208 1209 # For the time being, we don't care about anything other than "BINARY". 1210 # The only other value that is currently allowed by the standard is 1211 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8. 1212 hdrcharset = pax_headers.get("hdrcharset") 1213 if hdrcharset == "BINARY": 1214 encoding = tarfile.encoding 1215 else: 1216 encoding = "utf-8" 1217 1218 # Parse pax header information. A record looks like that: 1219 # "%d %s=%s\n" % (length, keyword, value). 
length is the size 1220 # of the complete record including the length field itself and 1221 # the newline. keyword and value are both UTF-8 encoded strings. 1222 regex = re.compile(br"(\d+) ([^=]+)=") 1223 pos = 0 1224 while True: 1225 match = regex.match(buf, pos) 1226 if not match: 1227 break 1228 1229 length, keyword = match.groups() 1230 length = int(length) 1231 value = buf[match.end(2) + 1:match.start(1) + length - 1] 1232 1233 # Normally, we could just use "utf-8" as the encoding and "strict" 1234 # as the error handler, but we better not take the risk. For 1235 # example, GNU tar <= 1.23 is known to store filenames it cannot 1236 # translate to UTF-8 as raw strings (unfortunately without a 1237 # hdrcharset=BINARY header). 1238 # We first try the strict standard encoding, and if that fails we 1239 # fall back on the user's encoding and error handler. 1240 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8", 1241 tarfile.errors) 1242 if keyword in PAX_NAME_FIELDS: 1243 value = self._decode_pax_field(value, encoding, tarfile.encoding, 1244 tarfile.errors) 1245 else: 1246 value = self._decode_pax_field(value, "utf-8", "utf-8", 1247 tarfile.errors) 1248 1249 pax_headers[keyword] = value 1250 pos += length 1251 1252 # Fetch the next header. 1253 try: 1254 next = self.fromtarfile(tarfile) 1255 except HeaderError: 1256 raise SubsequentHeaderError("missing or bad subsequent header") 1257 1258 # Process GNU sparse information. 1259 if "GNU.sparse.map" in pax_headers: 1260 # GNU extended sparse format version 0.1. 1261 self._proc_gnusparse_01(next, pax_headers) 1262 1263 elif "GNU.sparse.size" in pax_headers: 1264 # GNU extended sparse format version 0.0. 1265 self._proc_gnusparse_00(next, pax_headers, buf) 1266 1267 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0": 1268 # GNU extended sparse format version 1.0. 
1269 self._proc_gnusparse_10(next, pax_headers, tarfile) 1270 1271 if self.type in (XHDTYPE, SOLARIS_XHDTYPE): 1272 # Patch the TarInfo object with the extended header info. 1273 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) 1274 next.offset = self.offset 1275 1276 if "size" in pax_headers: 1277 # If the extended header replaces the size field, 1278 # we need to recalculate the offset where the next 1279 # header starts. 1280 offset = next.offset_data 1281 if next.isreg() or next.type not in SUPPORTED_TYPES: 1282 offset += next._block(next.size) 1283 tarfile.offset = offset 1284 1285 return next 1286 1287 def _proc_gnusparse_00(self, next, pax_headers, buf): 1288 """Process a GNU tar extended sparse header, version 0.0. 1289 """ 1290 offsets = [] 1291 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf): 1292 offsets.append(int(match.group(1))) 1293 numbytes = [] 1294 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf): 1295 numbytes.append(int(match.group(1))) 1296 next.sparse = list(zip(offsets, numbytes)) 1297 1298 def _proc_gnusparse_01(self, next, pax_headers): 1299 """Process a GNU tar extended sparse header, version 0.1. 1300 """ 1301 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")] 1302 next.sparse = list(zip(sparse[::2], sparse[1::2])) 1303 1304 def _proc_gnusparse_10(self, next, pax_headers, tarfile): 1305 """Process a GNU tar extended sparse header, version 1.0. 
        """
        fields = None
        sparse = []
        # The sparse map is stored in the member's data area as decimal
        # numbers separated by newlines; the first number is the count of
        # (offset, numbytes) pairs.  Refill buf block by block as needed.
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    # Convenience predicates on the member's type field.
    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo

class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None):
        """Open an (uncompressed) tar archive `name'.
        `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        self._mode = modes[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.copybufsize = copybufsize
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # Set to None first so the attribute exists even if the
                # self.next() call below raises.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in ("a", "w", "x"):
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Undo the partial construction: close a file we opened ourselves
            # and mark the object closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    #--------------------------------------------------------------------------
    # Below are the classmethods which act as alternate constructors to the
    # TarFile class. The open() method is the only one that is needed for
    # public use; it is the "super"-constructor and is able to select an
    # adequate "sub"-constructor for a particular compression using the mapping
    # from OPEN_METH.
    #
    # This concept allows one to subclass TarFile without losing the comfort of
    # the super-constructor. A sub-constructor is registered and made available
    # by adding it to the mapping in OPEN_METH.

    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending.
        Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression

           'x' or 'x:'  create a tarfile exclusively without compression, raise
                        an exception if the file is already created
           'x:gz'       create a gzip compressed tarfile, raise an exception
                        if the file is already created
           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                        if the file is already created
           'x:xz'       create an lzma compressed tarfile, raise an exception
                        if the file is already created

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            def not_compressed(comptype):
                return cls.OPEN_METH[comptype] == 'taropen'
            # Sorting on not_compressed() puts the plain taropen last, so
            # the compressed openers get the first try.
            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError):
                    # Rewind so the next candidate sees the stream start.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")

    @classmethod
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.
        """
        if mode not in ("r", "a", "w", "x"):
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        return cls(name, mode, fileobj, **kwargs)

    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
        Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            import gzip
            # Also guard against a gzip module that lacks GzipFile.
            gzip.GzipFile
        except (ImportError, AttributeError):
            raise CompressionError("gzip module is not available")

        try:
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
        except OSError:
            if fileobj is not None and mode == 'r':
                raise ReadError("not a gzip file")
            raise

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a gzip file")
            raise
        except:
            fileobj.close()
            raise
        # The GzipFile wrapper was created here, so close() must close it.
        t._extfileobj = False
        return t

    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            import bz2
        except ImportError:
            raise CompressionError("bz2 module is not available")

        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a bzip2 file")
            raise
        except:
            fileobj.close()
            raise
        t._extfileobj = False
        return t

    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            import lzma
        except ImportError:
            raise CompressionError("lzma module is not available")

        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (lzma.LZMAError, EOFError):
            fileobj.close()
            if mode == 'r':
                raise ReadError("not an lzma file")
            raise
        except:
            fileobj.close()
            raise
        t._extfileobj = False
        return t

    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }

    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode in ("a", "w", "x"):
                self.fileobj.write(NUL * (BLOCKSIZE * 2))
                self.offset += (BLOCKSIZE * 2)
                # fill up the end with zero-blocks
                # (like option -b20 for tar does)
                blocks, remainder = divmod(self.offset, RECORDSIZE)
                if remainder > 0:
                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
        """
        tarinfo = self._getmember(name)
        if tarinfo is None:
            raise KeyError("filename %r not found" % name)
        return tarinfo

    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
           list has the same order as the members in the archive.
        """
        self._check()
        if not self._loaded:    # if we want to obtain a list of
            self._load()        # all members, we first have to
                                # scan the whole archive.
        return self.members

    def getnames(self):
        """Return the members of the archive as a list of their names. It has
           the same order as the list returned by getmembers().
        """
        return [tarinfo.name for tarinfo in self.getmembers()]

    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
           on an existing file. The file is either named by `name', or
           specified as a file object `fileobj' with a file descriptor. If
           given, `arcname' specifies an alternative name for the file in the
           archive, otherwise, the name is taken from the 'name' attribute of
           'fileobj', or the 'name' argument. The name should be a text
           string.
        """
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self  # Not needed

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            # A regular file with nlink > 1 whose inode was already seen is
            # stored as a hardlink to the first occurrence.
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket): caller gets None.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo

    def list(self, verbose=True, *, members=None):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        self._check()

        if members is None:
            members = self
        for tarinfo in members:
            if verbose:
                _safe_print(stat.filemode(tarinfo.mode))
                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                       tarinfo.gname or tarinfo.gid))
                if tarinfo.ischr() or tarinfo.isblk():
                    _safe_print("%10s" %
                                ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
                else:
                    _safe_print("%10d" % tarinfo.size)
                _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
                            % time.localtime(tarinfo.mtime)[:6])

            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

            if verbose:
                if tarinfo.issym():
                    _safe_print("-> " + tarinfo.linkname)
                if tarinfo.islnk():
                    _safe_print("link to " + tarinfo.linkname)
            print()

    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
        """Add the file `name' to the archive.
        `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
           setting `recursive' to False. `exclude' is a function that should
           return True for each filename to be excluded. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
        """
        self._check("awx")

        if arcname is None:
            arcname = name

        # Exclude pathnames.
        # NOTE: `exclude' is deprecated in favour of `filter' (see the
        # DeprecationWarning below); it is kept for backward compatibility.
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                          DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                for f in os.listdir(name):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                             recursive, exclude, filter=filter)

        else:
            self.addfile(tarinfo)

    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, it should be a binary file, and tarinfo.size bytes are read
           from it and added to the archive. You can create TarInfo objects
           directly, or by using gettarinfo().
        """
        self._check("awx")

        # Copy so later mutation of the caller's object does not affect
        # the member list.
        tarinfo = copy.copy(tarinfo)

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)
        bufsize=self.copybufsize
        # If there's data to follow, append it.
        if fileobj is not None:
            copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
            if remainder > 0:
                # Pad the last data block with NULs up to BLOCKSIZE.
                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                blocks += 1
            self.offset += blocks * BLOCKSIZE

        self.members.append(tarinfo)

    def extractall(self, path=".", members=None, *, numeric_owner=False):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers(). If `numeric_owner` is True, only
           the numbers for user/group names are used and not the names.
        """
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
                # Extract directories with a safe mode.
2002 directories.append(tarinfo) 2003 tarinfo = copy.copy(tarinfo) 2004 tarinfo.mode = 0o700 2005 # Do not set_attrs directories, as we will do that further down 2006 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), 2007 numeric_owner=numeric_owner) 2008 2009 # Reverse sort directories. 2010 directories.sort(key=lambda a: a.name) 2011 directories.reverse() 2012 2013 # Set correct owner, mtime and filemode on directories. 2014 for tarinfo in directories: 2015 dirpath = os.path.join(path, tarinfo.name) 2016 try: 2017 self.chown(tarinfo, dirpath, numeric_owner=numeric_owner) 2018 self.utime(tarinfo, dirpath) 2019 self.chmod(tarinfo, dirpath) 2020 except ExtractError as e: 2021 if self.errorlevel > 1: 2022 raise 2023 else: 2024 self._dbg(1, "tarfile: %s" % e) 2025 2026 def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): 2027 """Extract a member from the archive to the current working directory, 2028 using its full name. Its file information is extracted as accurately 2029 as possible. `member' may be a filename or a TarInfo object. You can 2030 specify a different directory using `path'. File attributes (owner, 2031 mtime, mode) are set unless `set_attrs' is False. If `numeric_owner` 2032 is True, only the numbers for user/group names are used and not 2033 the names. 2034 """ 2035 self._check("r") 2036 2037 if isinstance(member, str): 2038 tarinfo = self.getmember(member) 2039 else: 2040 tarinfo = member 2041 2042 # Prepare the link target for makelink(). 
        if tarinfo.islnk():
            # Hard link targets are resolved relative to the extraction
            # root; remember the on-disk path for makelink().
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            # NOTE(review): tarinfo.name is joined to `path' without any
            # sanitization here, so an archive containing absolute paths
            # or ".." components can write outside `path' (CVE-2007-4559).
            # Only extract archives from trusted sources.
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs,
                                 numeric_owner=numeric_owner)
        except OSError as e:
            # errorlevel > 0 propagates OS errors; otherwise they are
            # only reported through _dbg().
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            # ExtractError is non-fatal unless errorlevel > 1.
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)

    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file or a
           link, an io.BufferedReader object is returned. Otherwise, None is
           returned.
        """
        self._check("r")

        # Accept either a member name or a TarInfo object.
        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
            # Members with unknown types are treated as regular files.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None

    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                        numeric_owner=False):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch on the member type; each make*() method below may be
        # overridden by a subclass.
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath, numeric_owner)
            if not tarinfo.issym():
                # mode/mtime are not applied to symlinks; chown() itself
                # uses lchown() for the symlink case.
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)

    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            # Use a safe mode for the directory, the real mode is set
            # later in _extract_member().
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    # The hard link's target is not on disk (e.g. it was
                    # not extracted): extract the archived target under
                    # this member's name instead.
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # The platform cannot create (sym)links: fall back to
            # extracting a full copy of the referenced member.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")

    def chown(self, tarinfo, targetpath, numeric_owner):
        """Set owner of targetpath according to tarinfo. If numeric_owner
           is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
           is False, fall back to .gid/.uid when the search based on name
           fails.
        """
        if hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            g = tarinfo.gid
            u = tarinfo.uid
            if not numeric_owner:
                # Prefer the symbolic names; silently keep the numeric
                # ids when the name is unknown on this system.
                try:
                    if grp:
                        g = grp.getgrnam(tarinfo.gname)[2]
                except KeyError:
                    pass
                try:
                    if pwd:
                        u = pwd.getpwnam(tarinfo.uname)[2]
                except KeyError:
                    pass
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    # Change the link itself, not the file it points to.
                    os.lchown(targetpath, u, g)
                else:
                    os.chown(targetpath, u, g)
            except OSError:
                raise ExtractError("could not change owner")

    def chmod(self, tarinfo, targetpath):
        """Set file permissions of targetpath according to tarinfo.
        """
        if hasattr(os, 'chmod'):
            try:
                os.chmod(targetpath, tarinfo.mode)
            except OSError:
                raise ExtractError("could not change mode")

    def utime(self, tarinfo, targetpath):
        """Set modification time of targetpath according to tarinfo.
        """
        if not hasattr(os, 'utime'):
            return
        try:
            # atime and mtime are both set to the archived mtime.
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except OSError:
            raise ExtractError("could not change modification time")

    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        # A member may have been read ahead and cached in
        # self.firstmember; serve it without touching the file again.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            # Seek to one byte before the expected header and read it so
            # that a file truncated before self.offset is detected here.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:
                    # Skip the all-zero block and keep scanning.
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # An invalid header at the very start means this is
                    # not a tar file at all.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # No further members: the archive is exhausted.
            self._loaded = True

        return tarinfo

    #--------------------------------------------------------------------------
    # Little helper methods:

    def _getmember(self, name, tarinfo=None, normalize=False):
        """Find an archive member by name from bottom to top.
           If tarinfo is given, it is used as the starting point.
        """
        # Ensure that all members have been loaded.
        members = self.getmembers()

        # Limit the member search list up to tarinfo.
        if tarinfo is not None:
            members = members[:members.index(tarinfo)]

        if normalize:
            name = os.path.normpath(name)

        # Search backwards so that the most recently archived member
        # with a matching name wins.
        for member in reversed(members):
            if normalize:
                member_name = os.path.normpath(member.name)
            else:
                member_name = member.name

            if name == member_name:
                return member

    def _load(self):
        """Read through the entire archive file and look for readable
           members.
        """
        while True:
            tarinfo = self.next()
            if tarinfo is None:
                break
        self._loaded = True

    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
           corresponds to TarFile's mode.
        """
        if self.closed:
            raise OSError("%s is closed" % self.__class__.__name__)
        if mode is not None and self.mode not in mode:
            raise OSError("bad operation for mode %r" % self.mode)

    def _find_link_target(self, tarinfo):
        """Find the target member of a symlink or hardlink member in the
           archive.  Raises KeyError if the target cannot be found.
        """
        if tarinfo.issym():
            # Always search the entire archive.
            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
            limit = None
        else:
            # Search the archive before the link, because a hard link is
            # just a reference to an already archived file.
            linkname = tarinfo.linkname
            limit = tarinfo

        member = self._getmember(linkname, tarinfo=limit, normalize=True)
        if member is None:
            raise KeyError("linkname %r not found" % linkname)
        return member

    def __iter__(self):
        """Provide an iterator object.
        """
        if self._loaded:
            # Everything is already cached in self.members.
            yield from self.members
            return

        # Yield items using TarFile's next() method.
2396 # When all members have been read, set TarFile as _loaded. 2397 index = 0 2398 # Fix for SF #1100429: Under rare circumstances it can 2399 # happen that getmembers() is called during iteration, 2400 # which will have already exhausted the next() method. 2401 if self.firstmember is not None: 2402 tarinfo = self.next() 2403 index += 1 2404 yield tarinfo 2405 2406 while True: 2407 if index < len(self.members): 2408 tarinfo = self.members[index] 2409 elif not self._loaded: 2410 tarinfo = self.next() 2411 if not tarinfo: 2412 self._loaded = True 2413 return 2414 else: 2415 return 2416 index += 1 2417 yield tarinfo 2418 2419 def _dbg(self, level, msg): 2420 """Write debugging output to sys.stderr. 2421 """ 2422 if level <= self.debug: 2423 print(msg, file=sys.stderr) 2424 2425 def __enter__(self): 2426 self._check() 2427 return self 2428 2429 def __exit__(self, type, value, traceback): 2430 if type is None: 2431 self.close() 2432 else: 2433 # An exception occurred. We must not call close() because 2434 # it would try to write end-of-archive blocks and padding. 2435 if not self._extfileobj: 2436 self.fileobj.close() 2437 self.closed = True 2438 2439#-------------------- 2440# exported functions 2441#-------------------- 2442def is_tarfile(name): 2443 """Return True if name points to a tar archive that we 2444 are able to handle, else return False. 2445 """ 2446 try: 2447 t = open(name) 2448 t.close() 2449 return True 2450 except TarError: 2451 return False 2452 2453open = TarFile.open 2454 2455 2456def main(): 2457 import argparse 2458 2459 description = 'A simple command line interface for tarfile module.' 
2460 parser = argparse.ArgumentParser(description=description) 2461 parser.add_argument('-v', '--verbose', action='store_true', default=False, 2462 help='Verbose output') 2463 group = parser.add_mutually_exclusive_group() 2464 group.add_argument('-l', '--list', metavar='<tarfile>', 2465 help='Show listing of a tarfile') 2466 group.add_argument('-e', '--extract', nargs='+', 2467 metavar=('<tarfile>', '<output_dir>'), 2468 help='Extract tarfile into target dir') 2469 group.add_argument('-c', '--create', nargs='+', 2470 metavar=('<name>', '<file>'), 2471 help='Create tarfile from sources') 2472 group.add_argument('-t', '--test', metavar='<tarfile>', 2473 help='Test if a tarfile is valid') 2474 args = parser.parse_args() 2475 2476 if args.test: 2477 src = args.test 2478 if is_tarfile(src): 2479 with open(src, 'r') as tar: 2480 tar.getmembers() 2481 print(tar.getmembers(), file=sys.stderr) 2482 if args.verbose: 2483 print('{!r} is a tar archive.'.format(src)) 2484 else: 2485 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2486 2487 elif args.list: 2488 src = args.list 2489 if is_tarfile(src): 2490 with TarFile.open(src, 'r:*') as tf: 2491 tf.list(verbose=args.verbose) 2492 else: 2493 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2494 2495 elif args.extract: 2496 if len(args.extract) == 1: 2497 src = args.extract[0] 2498 curdir = os.curdir 2499 elif len(args.extract) == 2: 2500 src, curdir = args.extract 2501 else: 2502 parser.exit(1, parser.format_help()) 2503 2504 if is_tarfile(src): 2505 with TarFile.open(src, 'r:*') as tf: 2506 tf.extractall(path=curdir) 2507 if args.verbose: 2508 if curdir == '.': 2509 msg = '{!r} file is extracted.'.format(src) 2510 else: 2511 msg = ('{!r} file is extracted ' 2512 'into {!r} directory.').format(src, curdir) 2513 print(msg) 2514 else: 2515 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2516 2517 elif args.create: 2518 tar_name = args.create.pop(0) 2519 _, ext = os.path.splitext(tar_name) 2520 
compressions = { 2521 # gz 2522 '.gz': 'gz', 2523 '.tgz': 'gz', 2524 # xz 2525 '.xz': 'xz', 2526 '.txz': 'xz', 2527 # bz2 2528 '.bz2': 'bz2', 2529 '.tbz': 'bz2', 2530 '.tbz2': 'bz2', 2531 '.tb2': 'bz2', 2532 } 2533 tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w' 2534 tar_files = args.create 2535 2536 with TarFile.open(tar_name, tar_mode) as tf: 2537 for file_name in tar_files: 2538 tf.add(file_name) 2539 2540 if args.verbose: 2541 print('{!r} file created.'.format(tar_name)) 2542 2543 else: 2544 parser.exit(1, parser.format_help()) 2545 2546if __name__ == '__main__': 2547 main() 2548