1#!/usr/bin/env python3 2#------------------------------------------------------------------- 3# tarfile.py 4#------------------------------------------------------------------- 5# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de> 6# All rights reserved. 7# 8# Permission is hereby granted, free of charge, to any person 9# obtaining a copy of this software and associated documentation 10# files (the "Software"), to deal in the Software without 11# restriction, including without limitation the rights to use, 12# copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the 14# Software is furnished to do so, subject to the following 15# conditions: 16# 17# The above copyright notice and this permission notice shall be 18# included in all copies or substantial portions of the Software. 19# 20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 27# OTHER DEALINGS IN THE SOFTWARE. 28# 29"""Read from and write to tar format archives. 30""" 31 32version = "0.9.0" 33__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)" 34__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend." 35 36#--------- 37# Imports 38#--------- 39from builtins import open as bltn_open 40import sys 41import os 42import io 43import shutil 44import stat 45import time 46import struct 47import copy 48import re 49 50try: 51 import pwd 52except ImportError: 53 pwd = None 54try: 55 import grp 56except ImportError: 57 grp = None 58 59# os.symlink on Windows prior to 6.0 raises NotImplementedError 60symlink_exception = (AttributeError, NotImplementedError) 61try: 62 # OSError (winerror=1314) will be raised if the caller does not hold the 63 # SeCreateSymbolicLinkPrivilege privilege 64 symlink_exception += (OSError,) 65except NameError: 66 pass 67 68# from tarfile import * 69__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", 70 "CompressionError", "StreamError", "ExtractError", "HeaderError", 71 "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", 72 "DEFAULT_FORMAT", "open"] 73 74#--------------------------------------------------------- 75# tar constants 76#--------------------------------------------------------- 77NUL = b"\0" # the null character 78BLOCKSIZE = 512 # length of processing blocks 79RECORDSIZE = BLOCKSIZE * 20 # length of records 80GNU_MAGIC = b"ustar \0" # magic gnu tar string 81POSIX_MAGIC = b"ustar\x0000" # magic posix tar string 82 83LENGTH_NAME = 100 # maximum length of a filename 84LENGTH_LINK = 100 # maximum length of a linkname 85LENGTH_PREFIX = 155 # maximum length of the prefix field 86 87REGTYPE = b"0" # regular file 88AREGTYPE = b"\0" # regular file 89LNKTYPE = b"1" # link (inside tarfile) 90SYMTYPE = b"2" # symbolic link 91CHRTYPE = b"3" # character special device 92BLKTYPE = b"4" # block special device 93DIRTYPE = b"5" # directory 94FIFOTYPE = b"6" # fifo special device 95CONTTYPE = b"7" # contiguous file 96 97GNUTYPE_LONGNAME = b"L" # GNU tar longname 98GNUTYPE_LONGLINK = b"K" # GNU tar longlink 99GNUTYPE_SPARSE = b"S" # GNU tar sparse file 100 101XHDTYPE = b"x" # POSIX.1-2001 extended header 102XGLTYPE = b"g" # POSIX.1-2001 global header 103SOLARIS_XHDTYPE = b"X" # Solaris extended header 104 105USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format 106GNU_FORMAT = 1 # GNU tar format 107PAX_FORMAT = 2 # POSIX.1-2001 (pax) format 108DEFAULT_FORMAT = PAX_FORMAT 109 110#--------------------------------------------------------- 111# tarfile constants 112#--------------------------------------------------------- 113# File types that tarfile supports: 114SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, 115 SYMTYPE, DIRTYPE, FIFOTYPE, 116 CONTTYPE, CHRTYPE, BLKTYPE, 117 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 118 GNUTYPE_SPARSE) 119 120# File types that will be treated as a regular file. 121REGULAR_TYPES = (REGTYPE, AREGTYPE, 122 CONTTYPE, GNUTYPE_SPARSE) 123 124# File types that are part of the GNU tar format. 125GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 126 GNUTYPE_SPARSE) 127 128# Fields from a pax header that override a TarInfo attribute. 129PAX_FIELDS = ("path", "linkpath", "size", "mtime", 130 "uid", "gid", "uname", "gname") 131 132# Fields from a pax header that are affected by hdrcharset. 133PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"} 134 135# Fields in a pax header that are numbers, all other fields 136# are treated as strings. 137PAX_NUMBER_FIELDS = { 138 "atime": float, 139 "ctime": float, 140 "mtime": float, 141 "uid": int, 142 "gid": int, 143 "size": int 144} 145 146#--------------------------------------------------------- 147# initialization 148#--------------------------------------------------------- 149if os.name == "nt": 150 ENCODING = "utf-8" 151else: 152 ENCODING = sys.getfilesystemencoding() 153 154#--------------------------------------------------------- 155# Some useful functions 156#--------------------------------------------------------- 157 158def stn(s, length, encoding, errors): 159 """Convert a string to a null-terminated bytes object. 160 """ 161 s = s.encode(encoding, errors) 162 return s[:length] + (length - len(s)) * NUL 163 164def nts(s, encoding, errors): 165 """Convert a null-terminated bytes object to a string. 166 """ 167 p = s.find(b"\0") 168 if p != -1: 169 s = s[:p] 170 return s.decode(encoding, errors) 171 172def nti(s): 173 """Convert a number field to a python number. 174 """ 175 # There are two possible encodings for a number field, see 176 # itn() below. 177 if s[0] in (0o200, 0o377): 178 n = 0 179 for i in range(len(s) - 1): 180 n <<= 8 181 n += s[i + 1] 182 if s[0] == 0o377: 183 n = -(256 ** (len(s) - 1) - n) 184 else: 185 try: 186 s = nts(s, "ascii", "strict") 187 n = int(s.strip() or "0", 8) 188 except ValueError: 189 raise InvalidHeaderError("invalid header") 190 return n 191 192def itn(n, digits=8, format=DEFAULT_FORMAT): 193 """Convert a python number to a number field. 194 """ 195 # POSIX 1003.1-1988 requires numbers to be encoded as a string of 196 # octal digits followed by a null-byte, this allows values up to 197 # (8**(digits-1))-1. GNU tar allows storing numbers greater than 198 # that if necessary. A leading 0o200 or 0o377 byte indicate this 199 # particular encoding, the following digits-1 bytes are a big-endian 200 # base-256 representation. This allows values up to (256**(digits-1))-1. 201 # A 0o200 byte indicates a positive number, a 0o377 byte a negative 202 # number. 203 original_n = n 204 n = int(n) 205 if 0 <= n < 8 ** (digits - 1): 206 s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL 207 elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1): 208 if n >= 0: 209 s = bytearray([0o200]) 210 else: 211 s = bytearray([0o377]) 212 n = 256 ** digits + n 213 214 for i in range(digits - 1): 215 s.insert(1, n & 0o377) 216 n >>= 8 217 else: 218 raise ValueError("overflow in number field") 219 220 return s 221 222def calc_chksums(buf): 223 """Calculate the checksum for a member's header by summing up all 224 characters except for the chksum field which is treated as if 225 it was filled with spaces. According to the GNU tar sources, 226 some tars (Sun and NeXT) calculate chksum with signed char, 227 which will be different if there are chars in the buffer with 228 the high bit set. So we calculate two checksums, unsigned and 229 signed. 230 """ 231 unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf)) 232 signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf)) 233 return unsigned_chksum, signed_chksum 234 235def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None): 236 """Copy length bytes from fileobj src to fileobj dst. 237 If length is None, copy the entire content. 238 """ 239 bufsize = bufsize or 16 * 1024 240 if length == 0: 241 return 242 if length is None: 243 shutil.copyfileobj(src, dst, bufsize) 244 return 245 246 blocks, remainder = divmod(length, bufsize) 247 for b in range(blocks): 248 buf = src.read(bufsize) 249 if len(buf) < bufsize: 250 raise exception("unexpected end of data") 251 dst.write(buf) 252 253 if remainder != 0: 254 buf = src.read(remainder) 255 if len(buf) < remainder: 256 raise exception("unexpected end of data") 257 dst.write(buf) 258 return 259 260def _safe_print(s): 261 encoding = getattr(sys.stdout, 'encoding', None) 262 if encoding is not None: 263 s = s.encode(encoding, 'backslashreplace').decode(encoding) 264 print(s, end=' ') 265 266 267class TarError(Exception): 268 """Base exception.""" 269 pass 270class ExtractError(TarError): 271 """General exception for extract errors.""" 272 pass 273class ReadError(TarError): 274 """Exception for unreadable tar archives.""" 275 pass 276class CompressionError(TarError): 277 """Exception for unavailable compression methods.""" 278 pass 279class StreamError(TarError): 280 """Exception for unsupported operations on stream-like TarFiles.""" 281 pass 282class HeaderError(TarError): 283 """Base exception for header errors.""" 284 pass 285class EmptyHeaderError(HeaderError): 286 """Exception for empty headers.""" 287 pass 288class TruncatedHeaderError(HeaderError): 289 """Exception for truncated headers.""" 290 pass 291class EOFHeaderError(HeaderError): 292 """Exception for end of file headers.""" 293 pass 294class InvalidHeaderError(HeaderError): 295 """Exception for invalid headers.""" 296 pass 297class SubsequentHeaderError(HeaderError): 298 """Exception for missing and invalid extended headers.""" 299 pass 300 301#--------------------------- 302# internal stream interface 303#--------------------------- 304class _LowLevelFile: 305 """Low-level file object. Supports reading and writing. 306 It is used instead of a regular file object for streaming 307 access. 308 """ 309 310 def __init__(self, name, mode): 311 mode = { 312 "r": os.O_RDONLY, 313 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 314 }[mode] 315 if hasattr(os, "O_BINARY"): 316 mode |= os.O_BINARY 317 self.fd = os.open(name, mode, 0o666) 318 319 def close(self): 320 os.close(self.fd) 321 322 def read(self, size): 323 return os.read(self.fd, size) 324 325 def write(self, s): 326 os.write(self.fd, s) 327 328class _Stream: 329 """Class that serves as an adapter between TarFile and 330 a stream-like object. The stream-like object only 331 needs to have a read() or write() method and is accessed 332 blockwise. Use of gzip or bzip2 compression is possible. 333 A stream-like object could be for example: sys.stdin, 334 sys.stdout, a socket, a tape device etc. 335 336 _Stream is intended to be used only internally. 337 """ 338 339 def __init__(self, name, mode, comptype, fileobj, bufsize): 340 """Construct a _Stream object. 341 """ 342 self._extfileobj = True 343 if fileobj is None: 344 fileobj = _LowLevelFile(name, mode) 345 self._extfileobj = False 346 347 if comptype == '*': 348 # Enable transparent compression detection for the 349 # stream interface 350 fileobj = _StreamProxy(fileobj) 351 comptype = fileobj.getcomptype() 352 353 self.name = name or "" 354 self.mode = mode 355 self.comptype = comptype 356 self.fileobj = fileobj 357 self.bufsize = bufsize 358 self.buf = b"" 359 self.pos = 0 360 self.closed = False 361 362 try: 363 if comptype == "gz": 364 try: 365 import zlib 366 except ImportError: 367 raise CompressionError("zlib module is not available") from None 368 self.zlib = zlib 369 self.crc = zlib.crc32(b"") 370 if mode == "r": 371 self._init_read_gz() 372 self.exception = zlib.error 373 else: 374 self._init_write_gz() 375 376 elif comptype == "bz2": 377 try: 378 import bz2 379 except ImportError: 380 raise CompressionError("bz2 module is not available") from None 381 if mode == "r": 382 self.dbuf = b"" 383 self.cmp = bz2.BZ2Decompressor() 384 self.exception = OSError 385 else: 386 self.cmp = bz2.BZ2Compressor() 387 388 elif comptype == "xz": 389 try: 390 import lzma 391 except ImportError: 392 raise CompressionError("lzma module is not available") from None 393 if mode == "r": 394 self.dbuf = b"" 395 self.cmp = lzma.LZMADecompressor() 396 self.exception = lzma.LZMAError 397 else: 398 self.cmp = lzma.LZMACompressor() 399 400 elif comptype != "tar": 401 raise CompressionError("unknown compression type %r" % comptype) 402 403 except: 404 if not self._extfileobj: 405 self.fileobj.close() 406 self.closed = True 407 raise 408 409 def __del__(self): 410 if hasattr(self, "closed") and not self.closed: 411 self.close() 412 413 def _init_write_gz(self): 414 """Initialize for writing with gzip compression. 415 """ 416 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED, 417 -self.zlib.MAX_WBITS, 418 self.zlib.DEF_MEM_LEVEL, 419 0) 420 timestamp = struct.pack("<L", int(time.time())) 421 self.__write(b"\037\213\010\010" + timestamp + b"\002\377") 422 if self.name.endswith(".gz"): 423 self.name = self.name[:-3] 424 # Honor "directory components removed" from RFC1952 425 self.name = os.path.basename(self.name) 426 # RFC1952 says we must use ISO-8859-1 for the FNAME field. 427 self.__write(self.name.encode("iso-8859-1", "replace") + NUL) 428 429 def write(self, s): 430 """Write string s to the stream. 431 """ 432 if self.comptype == "gz": 433 self.crc = self.zlib.crc32(s, self.crc) 434 self.pos += len(s) 435 if self.comptype != "tar": 436 s = self.cmp.compress(s) 437 self.__write(s) 438 439 def __write(self, s): 440 """Write string s to the stream if a whole new block 441 is ready to be written. 442 """ 443 self.buf += s 444 while len(self.buf) > self.bufsize: 445 self.fileobj.write(self.buf[:self.bufsize]) 446 self.buf = self.buf[self.bufsize:] 447 448 def close(self): 449 """Close the _Stream object. No operation should be 450 done on it afterwards. 451 """ 452 if self.closed: 453 return 454 455 self.closed = True 456 try: 457 if self.mode == "w" and self.comptype != "tar": 458 self.buf += self.cmp.flush() 459 460 if self.mode == "w" and self.buf: 461 self.fileobj.write(self.buf) 462 self.buf = b"" 463 if self.comptype == "gz": 464 self.fileobj.write(struct.pack("<L", self.crc)) 465 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF)) 466 finally: 467 if not self._extfileobj: 468 self.fileobj.close() 469 470 def _init_read_gz(self): 471 """Initialize for reading a gzip compressed fileobj. 472 """ 473 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) 474 self.dbuf = b"" 475 476 # taken from gzip.GzipFile with some alterations 477 if self.__read(2) != b"\037\213": 478 raise ReadError("not a gzip file") 479 if self.__read(1) != b"\010": 480 raise CompressionError("unsupported compression method") 481 482 flag = ord(self.__read(1)) 483 self.__read(6) 484 485 if flag & 4: 486 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1)) 487 self.read(xlen) 488 if flag & 8: 489 while True: 490 s = self.__read(1) 491 if not s or s == NUL: 492 break 493 if flag & 16: 494 while True: 495 s = self.__read(1) 496 if not s or s == NUL: 497 break 498 if flag & 2: 499 self.__read(2) 500 501 def tell(self): 502 """Return the stream's file pointer position. 503 """ 504 return self.pos 505 506 def seek(self, pos=0): 507 """Set the stream's file pointer to pos. Negative seeking 508 is forbidden. 509 """ 510 if pos - self.pos >= 0: 511 blocks, remainder = divmod(pos - self.pos, self.bufsize) 512 for i in range(blocks): 513 self.read(self.bufsize) 514 self.read(remainder) 515 else: 516 raise StreamError("seeking backwards is not allowed") 517 return self.pos 518 519 def read(self, size): 520 """Return the next size number of bytes from the stream.""" 521 assert size is not None 522 buf = self._read(size) 523 self.pos += len(buf) 524 return buf 525 526 def _read(self, size): 527 """Return size bytes from the stream. 528 """ 529 if self.comptype == "tar": 530 return self.__read(size) 531 532 c = len(self.dbuf) 533 t = [self.dbuf] 534 while c < size: 535 # Skip underlying buffer to avoid unaligned double buffering. 536 if self.buf: 537 buf = self.buf 538 self.buf = b"" 539 else: 540 buf = self.fileobj.read(self.bufsize) 541 if not buf: 542 break 543 try: 544 buf = self.cmp.decompress(buf) 545 except self.exception as e: 546 raise ReadError("invalid compressed data") from e 547 t.append(buf) 548 c += len(buf) 549 t = b"".join(t) 550 self.dbuf = t[size:] 551 return t[:size] 552 553 def __read(self, size): 554 """Return size bytes from stream. If internal buffer is empty, 555 read another block from the stream. 556 """ 557 c = len(self.buf) 558 t = [self.buf] 559 while c < size: 560 buf = self.fileobj.read(self.bufsize) 561 if not buf: 562 break 563 t.append(buf) 564 c += len(buf) 565 t = b"".join(t) 566 self.buf = t[size:] 567 return t[:size] 568# class _Stream 569 570class _StreamProxy(object): 571 """Small proxy class that enables transparent compression 572 detection for the Stream interface (mode 'r|*'). 573 """ 574 575 def __init__(self, fileobj): 576 self.fileobj = fileobj 577 self.buf = self.fileobj.read(BLOCKSIZE) 578 579 def read(self, size): 580 self.read = self.fileobj.read 581 return self.buf 582 583 def getcomptype(self): 584 if self.buf.startswith(b"\x1f\x8b\x08"): 585 return "gz" 586 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY": 587 return "bz2" 588 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")): 589 return "xz" 590 else: 591 return "tar" 592 593 def close(self): 594 self.fileobj.close() 595# class StreamProxy 596 597#------------------------ 598# Extraction file object 599#------------------------ 600class _FileInFile(object): 601 """A thin wrapper around an existing file object that 602 provides a part of its data as an individual file 603 object. 604 """ 605 606 def __init__(self, fileobj, offset, size, blockinfo=None): 607 self.fileobj = fileobj 608 self.offset = offset 609 self.size = size 610 self.position = 0 611 self.name = getattr(fileobj, "name", None) 612 self.closed = False 613 614 if blockinfo is None: 615 blockinfo = [(0, size)] 616 617 # Construct a map with data and zero blocks. 618 self.map_index = 0 619 self.map = [] 620 lastpos = 0 621 realpos = self.offset 622 for offset, size in blockinfo: 623 if offset > lastpos: 624 self.map.append((False, lastpos, offset, None)) 625 self.map.append((True, offset, offset + size, realpos)) 626 realpos += size 627 lastpos = offset + size 628 if lastpos < self.size: 629 self.map.append((False, lastpos, self.size, None)) 630 631 def flush(self): 632 pass 633 634 def readable(self): 635 return True 636 637 def writable(self): 638 return False 639 640 def seekable(self): 641 return self.fileobj.seekable() 642 643 def tell(self): 644 """Return the current file position. 645 """ 646 return self.position 647 648 def seek(self, position, whence=io.SEEK_SET): 649 """Seek to a position in the file. 650 """ 651 if whence == io.SEEK_SET: 652 self.position = min(max(position, 0), self.size) 653 elif whence == io.SEEK_CUR: 654 if position < 0: 655 self.position = max(self.position + position, 0) 656 else: 657 self.position = min(self.position + position, self.size) 658 elif whence == io.SEEK_END: 659 self.position = max(min(self.size + position, self.size), 0) 660 else: 661 raise ValueError("Invalid argument") 662 return self.position 663 664 def read(self, size=None): 665 """Read data from the file. 666 """ 667 if size is None: 668 size = self.size - self.position 669 else: 670 size = min(size, self.size - self.position) 671 672 buf = b"" 673 while size > 0: 674 while True: 675 data, start, stop, offset = self.map[self.map_index] 676 if start <= self.position < stop: 677 break 678 else: 679 self.map_index += 1 680 if self.map_index == len(self.map): 681 self.map_index = 0 682 length = min(size, stop - self.position) 683 if data: 684 self.fileobj.seek(offset + (self.position - start)) 685 b = self.fileobj.read(length) 686 if len(b) != length: 687 raise ReadError("unexpected end of data") 688 buf += b 689 else: 690 buf += NUL * length 691 size -= length 692 self.position += length 693 return buf 694 695 def readinto(self, b): 696 buf = self.read(len(b)) 697 b[:len(buf)] = buf 698 return len(buf) 699 700 def close(self): 701 self.closed = True 702#class _FileInFile 703 704class ExFileObject(io.BufferedReader): 705 706 def __init__(self, tarfile, tarinfo): 707 fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data, 708 tarinfo.size, tarinfo.sparse) 709 super().__init__(fileobj) 710#class ExFileObject 711 712#------------------ 713# Exported Classes 714#------------------ 715class TarInfo(object): 716 """Informational class which holds the details about an 717 archive member given by a tar header block. 718 TarInfo objects are returned by TarFile.getmember(), 719 TarFile.getmembers() and TarFile.gettarinfo() and are 720 usually created internally. 721 """ 722 723 __slots__ = dict( 724 name = 'Name of the archive member.', 725 mode = 'Permission bits.', 726 uid = 'User ID of the user who originally stored this member.', 727 gid = 'Group ID of the user who originally stored this member.', 728 size = 'Size in bytes.', 729 mtime = 'Time of last modification.', 730 chksum = 'Header checksum.', 731 type = ('File type. type is usually one of these constants: ' 732 'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, ' 733 'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'), 734 linkname = ('Name of the target file name, which is only present ' 735 'in TarInfo objects of type LNKTYPE and SYMTYPE.'), 736 uname = 'User name.', 737 gname = 'Group name.', 738 devmajor = 'Device major number.', 739 devminor = 'Device minor number.', 740 offset = 'The tar header starts here.', 741 offset_data = "The file's data starts here.", 742 pax_headers = ('A dictionary containing key-value pairs of an ' 743 'associated pax extended header.'), 744 sparse = 'Sparse member information.', 745 tarfile = None, 746 _sparse_structs = None, 747 _link_target = None, 748 ) 749 750 def __init__(self, name=""): 751 """Construct a TarInfo object. name is the optional name 752 of the member. 753 """ 754 self.name = name # member name 755 self.mode = 0o644 # file permissions 756 self.uid = 0 # user id 757 self.gid = 0 # group id 758 self.size = 0 # file size 759 self.mtime = 0 # modification time 760 self.chksum = 0 # header checksum 761 self.type = REGTYPE # member type 762 self.linkname = "" # link name 763 self.uname = "" # user name 764 self.gname = "" # group name 765 self.devmajor = 0 # device major number 766 self.devminor = 0 # device minor number 767 768 self.offset = 0 # the tar header starts here 769 self.offset_data = 0 # the file's data starts here 770 771 self.sparse = None # sparse member information 772 self.pax_headers = {} # pax header information 773 774 @property 775 def path(self): 776 'In pax headers, "name" is called "path".' 777 return self.name 778 779 @path.setter 780 def path(self, name): 781 self.name = name 782 783 @property 784 def linkpath(self): 785 'In pax headers, "linkname" is called "linkpath".' 786 return self.linkname 787 788 @linkpath.setter 789 def linkpath(self, linkname): 790 self.linkname = linkname 791 792 def __repr__(self): 793 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) 794 795 def get_info(self): 796 """Return the TarInfo's attributes as a dictionary. 797 """ 798 info = { 799 "name": self.name, 800 "mode": self.mode & 0o7777, 801 "uid": self.uid, 802 "gid": self.gid, 803 "size": self.size, 804 "mtime": self.mtime, 805 "chksum": self.chksum, 806 "type": self.type, 807 "linkname": self.linkname, 808 "uname": self.uname, 809 "gname": self.gname, 810 "devmajor": self.devmajor, 811 "devminor": self.devminor 812 } 813 814 if info["type"] == DIRTYPE and not info["name"].endswith("/"): 815 info["name"] += "/" 816 817 return info 818 819 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"): 820 """Return a tar header as a string of 512 byte blocks. 821 """ 822 info = self.get_info() 823 824 if format == USTAR_FORMAT: 825 return self.create_ustar_header(info, encoding, errors) 826 elif format == GNU_FORMAT: 827 return self.create_gnu_header(info, encoding, errors) 828 elif format == PAX_FORMAT: 829 return self.create_pax_header(info, encoding) 830 else: 831 raise ValueError("invalid format") 832 833 def create_ustar_header(self, info, encoding, errors): 834 """Return the object as a ustar header block. 835 """ 836 info["magic"] = POSIX_MAGIC 837 838 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 839 raise ValueError("linkname is too long") 840 841 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 842 info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors) 843 844 return self._create_header(info, USTAR_FORMAT, encoding, errors) 845 846 def create_gnu_header(self, info, encoding, errors): 847 """Return the object as a GNU header block sequence. 848 """ 849 info["magic"] = GNU_MAGIC 850 851 buf = b"" 852 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 853 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors) 854 855 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 856 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors) 857 858 return buf + self._create_header(info, GNU_FORMAT, encoding, errors) 859 860 def create_pax_header(self, info, encoding): 861 """Return the object as a ustar header block. If it cannot be 862 represented this way, prepend a pax extended header sequence 863 with supplement information. 864 """ 865 info["magic"] = POSIX_MAGIC 866 pax_headers = self.pax_headers.copy() 867 868 # Test string fields for values that exceed the field length or cannot 869 # be represented in ASCII encoding. 870 for name, hname, length in ( 871 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), 872 ("uname", "uname", 32), ("gname", "gname", 32)): 873 874 if hname in pax_headers: 875 # The pax header has priority. 876 continue 877 878 # Try to encode the string as ASCII. 879 try: 880 info[name].encode("ascii", "strict") 881 except UnicodeEncodeError: 882 pax_headers[hname] = info[name] 883 continue 884 885 if len(info[name]) > length: 886 pax_headers[hname] = info[name] 887 888 # Test number fields for values that exceed the field limit or values 889 # that like to be stored as float. 890 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): 891 needs_pax = False 892 893 val = info[name] 894 val_is_float = isinstance(val, float) 895 val_int = round(val) if val_is_float else val 896 if not 0 <= val_int < 8 ** (digits - 1): 897 # Avoid overflow. 898 info[name] = 0 899 needs_pax = True 900 elif val_is_float: 901 # Put rounded value in ustar header, and full 902 # precision value in pax header. 903 info[name] = val_int 904 needs_pax = True 905 906 # The existing pax header has priority. 907 if needs_pax and name not in pax_headers: 908 pax_headers[name] = str(val) 909 910 # Create a pax extended header if necessary. 911 if pax_headers: 912 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding) 913 else: 914 buf = b"" 915 916 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace") 917 918 @classmethod 919 def create_pax_global_header(cls, pax_headers): 920 """Return the object as a pax global header block sequence. 921 """ 922 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8") 923 924 def _posix_split_name(self, name, encoding, errors): 925 """Split a name longer than 100 chars into a prefix 926 and a name part. 927 """ 928 components = name.split("/") 929 for i in range(1, len(components)): 930 prefix = "/".join(components[:i]) 931 name = "/".join(components[i:]) 932 if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \ 933 len(name.encode(encoding, errors)) <= LENGTH_NAME: 934 break 935 else: 936 raise ValueError("name is too long") 937 938 return prefix, name 939 940 @staticmethod 941 def _create_header(info, format, encoding, errors): 942 """Return a header block. info is a dictionary with file 943 information, format must be one of the *_FORMAT constants. 944 """ 945 has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE) 946 if has_device_fields: 947 devmajor = itn(info.get("devmajor", 0), 8, format) 948 devminor = itn(info.get("devminor", 0), 8, format) 949 else: 950 devmajor = stn("", 8, encoding, errors) 951 devminor = stn("", 8, encoding, errors) 952 953 parts = [ 954 stn(info.get("name", ""), 100, encoding, errors), 955 itn(info.get("mode", 0) & 0o7777, 8, format), 956 itn(info.get("uid", 0), 8, format), 957 itn(info.get("gid", 0), 8, format), 958 itn(info.get("size", 0), 12, format), 959 itn(info.get("mtime", 0), 12, format), 960 b" ", # checksum field 961 info.get("type", REGTYPE), 962 stn(info.get("linkname", ""), 100, encoding, errors), 963 info.get("magic", POSIX_MAGIC), 964 stn(info.get("uname", ""), 32, encoding, errors), 965 stn(info.get("gname", ""), 32, encoding, errors), 966 devmajor, 967 devminor, 968 stn(info.get("prefix", ""), 155, encoding, errors) 969 ] 970 971 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts)) 972 chksum = calc_chksums(buf[-BLOCKSIZE:])[0] 973 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:] 974 return buf 975 976 @staticmethod 977 def _create_payload(payload): 978 """Return the string payload filled with zero bytes 979 up to the next 512 byte border. 980 """ 981 blocks, remainder = divmod(len(payload), BLOCKSIZE) 982 if remainder > 0: 983 payload += (BLOCKSIZE - remainder) * NUL 984 return payload 985 986 @classmethod 987 def _create_gnu_long_header(cls, name, type, encoding, errors): 988 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence 989 for name. 990 """ 991 name = name.encode(encoding, errors) + NUL 992 993 info = {} 994 info["name"] = "././@LongLink" 995 info["type"] = type 996 info["size"] = len(name) 997 info["magic"] = GNU_MAGIC 998 999 # create extended header + name blocks. 1000 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \ 1001 cls._create_payload(name) 1002 1003 @classmethod 1004 def _create_pax_generic_header(cls, pax_headers, type, encoding): 1005 """Return a POSIX.1-2008 extended or global header sequence 1006 that contains a list of keyword, value pairs. The values 1007 must be strings. 1008 """ 1009 # Check if one of the fields contains surrogate characters and thereby 1010 # forces hdrcharset=BINARY, see _proc_pax() for more information. 1011 binary = False 1012 for keyword, value in pax_headers.items(): 1013 try: 1014 value.encode("utf-8", "strict") 1015 except UnicodeEncodeError: 1016 binary = True 1017 break 1018 1019 records = b"" 1020 if binary: 1021 # Put the hdrcharset field at the beginning of the header. 1022 records += b"21 hdrcharset=BINARY\n" 1023 1024 for keyword, value in pax_headers.items(): 1025 keyword = keyword.encode("utf-8") 1026 if binary: 1027 # Try to restore the original byte representation of `value'. 1028 # Needless to say, that the encoding must match the string. 1029 value = value.encode(encoding, "surrogateescape") 1030 else: 1031 value = value.encode("utf-8") 1032 1033 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' 1034 n = p = 0 1035 while True: 1036 n = l + len(str(p)) 1037 if n == p: 1038 break 1039 p = n 1040 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n" 1041 1042 # We use a hardcoded "././@PaxHeader" name like star does 1043 # instead of the one that POSIX recommends. 1044 info = {} 1045 info["name"] = "././@PaxHeader" 1046 info["type"] = type 1047 info["size"] = len(records) 1048 info["magic"] = POSIX_MAGIC 1049 1050 # Create pax header + record blocks. 1051 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \ 1052 cls._create_payload(records) 1053 1054 @classmethod 1055 def frombuf(cls, buf, encoding, errors): 1056 """Construct a TarInfo object from a 512 byte bytes object. 1057 """ 1058 if len(buf) == 0: 1059 raise EmptyHeaderError("empty header") 1060 if len(buf) != BLOCKSIZE: 1061 raise TruncatedHeaderError("truncated header") 1062 if buf.count(NUL) == BLOCKSIZE: 1063 raise EOFHeaderError("end of file header") 1064 1065 chksum = nti(buf[148:156]) 1066 if chksum not in calc_chksums(buf): 1067 raise InvalidHeaderError("bad checksum") 1068 1069 obj = cls() 1070 obj.name = nts(buf[0:100], encoding, errors) 1071 obj.mode = nti(buf[100:108]) 1072 obj.uid = nti(buf[108:116]) 1073 obj.gid = nti(buf[116:124]) 1074 obj.size = nti(buf[124:136]) 1075 obj.mtime = nti(buf[136:148]) 1076 obj.chksum = chksum 1077 obj.type = buf[156:157] 1078 obj.linkname = nts(buf[157:257], encoding, errors) 1079 obj.uname = nts(buf[265:297], encoding, errors) 1080 obj.gname = nts(buf[297:329], encoding, errors) 1081 obj.devmajor = nti(buf[329:337]) 1082 obj.devminor = nti(buf[337:345]) 1083 prefix = nts(buf[345:500], encoding, errors) 1084 1085 # Old V7 tar format represents a directory as a regular 1086 # file with a trailing slash. 1087 if obj.type == AREGTYPE and obj.name.endswith("/"): 1088 obj.type = DIRTYPE 1089 1090 # The old GNU sparse format occupies some of the unused 1091 # space in the buffer for up to 4 sparse structures. 1092 # Save them for later processing in _proc_sparse(). 1093 if obj.type == GNUTYPE_SPARSE: 1094 pos = 386 1095 structs = [] 1096 for i in range(4): 1097 try: 1098 offset = nti(buf[pos:pos + 12]) 1099 numbytes = nti(buf[pos + 12:pos + 24]) 1100 except ValueError: 1101 break 1102 structs.append((offset, numbytes)) 1103 pos += 24 1104 isextended = bool(buf[482]) 1105 origsize = nti(buf[483:495]) 1106 obj._sparse_structs = (structs, isextended, origsize) 1107 1108 # Remove redundant slashes from directories. 1109 if obj.isdir(): 1110 obj.name = obj.name.rstrip("/") 1111 1112 # Reconstruct a ustar longname. 1113 if prefix and obj.type not in GNU_TYPES: 1114 obj.name = prefix + "/" + obj.name 1115 return obj 1116 1117 @classmethod 1118 def fromtarfile(cls, tarfile): 1119 """Return the next TarInfo object from TarFile object 1120 tarfile. 1121 """ 1122 buf = tarfile.fileobj.read(BLOCKSIZE) 1123 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors) 1124 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE 1125 return obj._proc_member(tarfile) 1126 1127 #-------------------------------------------------------------------------- 1128 # The following are methods that are called depending on the type of a 1129 # member. The entry point is _proc_member() which can be overridden in a 1130 # subclass to add custom _proc_*() methods. A _proc_*() method MUST 1131 # implement the following 1132 # operations: 1133 # 1. Set self.offset_data to the position where the data blocks begin, 1134 # if there is data that follows. 1135 # 2. Set tarfile.offset to the position where the next member's header will 1136 # begin. 1137 # 3. Return self or another valid TarInfo object. 1138 def _proc_member(self, tarfile): 1139 """Choose the right processing method depending on 1140 the type and call it. 1141 """ 1142 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): 1143 return self._proc_gnulong(tarfile) 1144 elif self.type == GNUTYPE_SPARSE: 1145 return self._proc_sparse(tarfile) 1146 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): 1147 return self._proc_pax(tarfile) 1148 else: 1149 return self._proc_builtin(tarfile) 1150 1151 def _proc_builtin(self, tarfile): 1152 """Process a builtin type or an unknown type which 1153 will be treated as a regular file. 1154 """ 1155 self.offset_data = tarfile.fileobj.tell() 1156 offset = self.offset_data 1157 if self.isreg() or self.type not in SUPPORTED_TYPES: 1158 # Skip the following data blocks. 1159 offset += self._block(self.size) 1160 tarfile.offset = offset 1161 1162 # Patch the TarInfo object with saved global 1163 # header information. 1164 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors) 1165 1166 return self 1167 1168 def _proc_gnulong(self, tarfile): 1169 """Process the blocks that hold a GNU longname 1170 or longlink member. 1171 """ 1172 buf = tarfile.fileobj.read(self._block(self.size)) 1173 1174 # Fetch the next header and process it. 1175 try: 1176 next = self.fromtarfile(tarfile) 1177 except HeaderError as e: 1178 raise SubsequentHeaderError(str(e)) from None 1179 1180 # Patch the TarInfo object from the next header with 1181 # the longname information. 1182 next.offset = self.offset 1183 if self.type == GNUTYPE_LONGNAME: 1184 next.name = nts(buf, tarfile.encoding, tarfile.errors) 1185 elif self.type == GNUTYPE_LONGLINK: 1186 next.linkname = nts(buf, tarfile.encoding, tarfile.errors) 1187 1188 return next 1189 1190 def _proc_sparse(self, tarfile): 1191 """Process a GNU sparse header plus extra headers. 1192 """ 1193 # We already collected some sparse structures in frombuf(). 1194 structs, isextended, origsize = self._sparse_structs 1195 del self._sparse_structs 1196 1197 # Collect sparse structures from extended header blocks. 1198 while isextended: 1199 buf = tarfile.fileobj.read(BLOCKSIZE) 1200 pos = 0 1201 for i in range(21): 1202 try: 1203 offset = nti(buf[pos:pos + 12]) 1204 numbytes = nti(buf[pos + 12:pos + 24]) 1205 except ValueError: 1206 break 1207 if offset and numbytes: 1208 structs.append((offset, numbytes)) 1209 pos += 24 1210 isextended = bool(buf[504]) 1211 self.sparse = structs 1212 1213 self.offset_data = tarfile.fileobj.tell() 1214 tarfile.offset = self.offset_data + self._block(self.size) 1215 self.size = origsize 1216 return self 1217 1218 def _proc_pax(self, tarfile): 1219 """Process an extended or global header as described in 1220 POSIX.1-2008. 1221 """ 1222 # Read the header information. 1223 buf = tarfile.fileobj.read(self._block(self.size)) 1224 1225 # A pax header stores supplemental information for either 1226 # the following file (extended) or all following files 1227 # (global). 1228 if self.type == XGLTYPE: 1229 pax_headers = tarfile.pax_headers 1230 else: 1231 pax_headers = tarfile.pax_headers.copy() 1232 1233 # Check if the pax header contains a hdrcharset field. This tells us 1234 # the encoding of the path, linkpath, uname and gname fields. Normally, 1235 # these fields are UTF-8 encoded but since POSIX.1-2008 tar 1236 # implementations are allowed to store them as raw binary strings if 1237 # the translation to UTF-8 fails. 1238 match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf) 1239 if match is not None: 1240 pax_headers["hdrcharset"] = match.group(1).decode("utf-8") 1241 1242 # For the time being, we don't care about anything other than "BINARY". 1243 # The only other value that is currently allowed by the standard is 1244 # "ISO-IR 10646 2000 UTF-8" in other words UTF-8. 1245 hdrcharset = pax_headers.get("hdrcharset") 1246 if hdrcharset == "BINARY": 1247 encoding = tarfile.encoding 1248 else: 1249 encoding = "utf-8" 1250 1251 # Parse pax header information. A record looks like that: 1252 # "%d %s=%s\n" % (length, keyword, value). length is the size 1253 # of the complete record including the length field itself and 1254 # the newline. keyword and value are both UTF-8 encoded strings. 1255 regex = re.compile(br"(\d+) ([^=]+)=") 1256 pos = 0 1257 while True: 1258 match = regex.match(buf, pos) 1259 if not match: 1260 break 1261 1262 length, keyword = match.groups() 1263 length = int(length) 1264 if length == 0: 1265 raise InvalidHeaderError("invalid header") 1266 value = buf[match.end(2) + 1:match.start(1) + length - 1] 1267 1268 # Normally, we could just use "utf-8" as the encoding and "strict" 1269 # as the error handler, but we better not take the risk. For 1270 # example, GNU tar <= 1.23 is known to store filenames it cannot 1271 # translate to UTF-8 as raw strings (unfortunately without a 1272 # hdrcharset=BINARY header). 1273 # We first try the strict standard encoding, and if that fails we 1274 # fall back on the user's encoding and error handler. 1275 keyword = self._decode_pax_field(keyword, "utf-8", "utf-8", 1276 tarfile.errors) 1277 if keyword in PAX_NAME_FIELDS: 1278 value = self._decode_pax_field(value, encoding, tarfile.encoding, 1279 tarfile.errors) 1280 else: 1281 value = self._decode_pax_field(value, "utf-8", "utf-8", 1282 tarfile.errors) 1283 1284 pax_headers[keyword] = value 1285 pos += length 1286 1287 # Fetch the next header. 1288 try: 1289 next = self.fromtarfile(tarfile) 1290 except HeaderError as e: 1291 raise SubsequentHeaderError(str(e)) from None 1292 1293 # Process GNU sparse information. 1294 if "GNU.sparse.map" in pax_headers: 1295 # GNU extended sparse format version 0.1. 1296 self._proc_gnusparse_01(next, pax_headers) 1297 1298 elif "GNU.sparse.size" in pax_headers: 1299 # GNU extended sparse format version 0.0. 1300 self._proc_gnusparse_00(next, pax_headers, buf) 1301 1302 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0": 1303 # GNU extended sparse format version 1.0. 1304 self._proc_gnusparse_10(next, pax_headers, tarfile) 1305 1306 if self.type in (XHDTYPE, SOLARIS_XHDTYPE): 1307 # Patch the TarInfo object with the extended header info. 1308 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) 1309 next.offset = self.offset 1310 1311 if "size" in pax_headers: 1312 # If the extended header replaces the size field, 1313 # we need to recalculate the offset where the next 1314 # header starts. 1315 offset = next.offset_data 1316 if next.isreg() or next.type not in SUPPORTED_TYPES: 1317 offset += next._block(next.size) 1318 tarfile.offset = offset 1319 1320 return next 1321 1322 def _proc_gnusparse_00(self, next, pax_headers, buf): 1323 """Process a GNU tar extended sparse header, version 0.0. 1324 """ 1325 offsets = [] 1326 for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf): 1327 offsets.append(int(match.group(1))) 1328 numbytes = [] 1329 for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf): 1330 numbytes.append(int(match.group(1))) 1331 next.sparse = list(zip(offsets, numbytes)) 1332 1333 def _proc_gnusparse_01(self, next, pax_headers): 1334 """Process a GNU tar extended sparse header, version 0.1. 1335 """ 1336 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")] 1337 next.sparse = list(zip(sparse[::2], sparse[1::2])) 1338 1339 def _proc_gnusparse_10(self, next, pax_headers, tarfile): 1340 """Process a GNU tar extended sparse header, version 1.0. 1341 """ 1342 fields = None 1343 sparse = [] 1344 buf = tarfile.fileobj.read(BLOCKSIZE) 1345 fields, buf = buf.split(b"\n", 1) 1346 fields = int(fields) 1347 while len(sparse) < fields * 2: 1348 if b"\n" not in buf: 1349 buf += tarfile.fileobj.read(BLOCKSIZE) 1350 number, buf = buf.split(b"\n", 1) 1351 sparse.append(int(number)) 1352 next.offset_data = tarfile.fileobj.tell() 1353 next.sparse = list(zip(sparse[::2], sparse[1::2])) 1354 1355 def _apply_pax_info(self, pax_headers, encoding, errors): 1356 """Replace fields with supplemental information from a previous 1357 pax extended or global header. 1358 """ 1359 for keyword, value in pax_headers.items(): 1360 if keyword == "GNU.sparse.name": 1361 setattr(self, "path", value) 1362 elif keyword == "GNU.sparse.size": 1363 setattr(self, "size", int(value)) 1364 elif keyword == "GNU.sparse.realsize": 1365 setattr(self, "size", int(value)) 1366 elif keyword in PAX_FIELDS: 1367 if keyword in PAX_NUMBER_FIELDS: 1368 try: 1369 value = PAX_NUMBER_FIELDS[keyword](value) 1370 except ValueError: 1371 value = 0 1372 if keyword == "path": 1373 value = value.rstrip("/") 1374 setattr(self, keyword, value) 1375 1376 self.pax_headers = pax_headers.copy() 1377 1378 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors): 1379 """Decode a single field from a pax record. 1380 """ 1381 try: 1382 return value.decode(encoding, "strict") 1383 except UnicodeDecodeError: 1384 return value.decode(fallback_encoding, fallback_errors) 1385 1386 def _block(self, count): 1387 """Round up a byte count by BLOCKSIZE and return it, 1388 e.g. _block(834) => 1024. 1389 """ 1390 blocks, remainder = divmod(count, BLOCKSIZE) 1391 if remainder: 1392 blocks += 1 1393 return blocks * BLOCKSIZE 1394 1395 def isreg(self): 1396 'Return True if the Tarinfo object is a regular file.' 1397 return self.type in REGULAR_TYPES 1398 1399 def isfile(self): 1400 'Return True if the Tarinfo object is a regular file.' 1401 return self.isreg() 1402 1403 def isdir(self): 1404 'Return True if it is a directory.' 1405 return self.type == DIRTYPE 1406 1407 def issym(self): 1408 'Return True if it is a symbolic link.' 1409 return self.type == SYMTYPE 1410 1411 def islnk(self): 1412 'Return True if it is a hard link.' 1413 return self.type == LNKTYPE 1414 1415 def ischr(self): 1416 'Return True if it is a character device.' 1417 return self.type == CHRTYPE 1418 1419 def isblk(self): 1420 'Return True if it is a block device.' 1421 return self.type == BLKTYPE 1422 1423 def isfifo(self): 1424 'Return True if it is a FIFO.' 1425 return self.type == FIFOTYPE 1426 1427 def issparse(self): 1428 return self.sparse is not None 1429 1430 def isdev(self): 1431 'Return True if it is one of character device, block device or FIFO.' 1432 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) 1433# class TarInfo 1434 1435class TarFile(object): 1436 """The TarFile Class provides an interface to tar archives. 1437 """ 1438 1439 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs) 1440 1441 dereference = False # If true, add content of linked file to the 1442 # tar file, else the link. 1443 1444 ignore_zeros = False # If true, skips empty or invalid blocks and 1445 # continues processing. 1446 1447 errorlevel = 1 # If 0, fatal errors only appear in debug 1448 # messages (if debug >= 0). If > 0, errors 1449 # are passed to the caller as exceptions. 1450 1451 format = DEFAULT_FORMAT # The format to use when creating an archive. 1452 1453 encoding = ENCODING # Encoding for 8-bit character strings. 1454 1455 errors = None # Error handler for unicode conversion. 1456 1457 tarinfo = TarInfo # The default TarInfo class to use. 1458 1459 fileobject = ExFileObject # The file-object for extractfile(). 1460 1461 def __init__(self, name=None, mode="r", fileobj=None, format=None, 1462 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, 1463 errors="surrogateescape", pax_headers=None, debug=None, 1464 errorlevel=None, copybufsize=None): 1465 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to 1466 read from an existing archive, 'a' to append data to an existing 1467 file or 'w' to create a new file overwriting an existing one. `mode' 1468 defaults to 'r'. 1469 If `fileobj' is given, it is used for reading or writing data. If it 1470 can be determined, `mode' is overridden by `fileobj's mode. 1471 `fileobj' is not closed, when TarFile is closed. 1472 """ 1473 modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"} 1474 if mode not in modes: 1475 raise ValueError("mode must be 'r', 'a', 'w' or 'x'") 1476 self.mode = mode 1477 self._mode = modes[mode] 1478 1479 if not fileobj: 1480 if self.mode == "a" and not os.path.exists(name): 1481 # Create nonexistent files in append mode. 1482 self.mode = "w" 1483 self._mode = "wb" 1484 fileobj = bltn_open(name, self._mode) 1485 self._extfileobj = False 1486 else: 1487 if (name is None and hasattr(fileobj, "name") and 1488 isinstance(fileobj.name, (str, bytes))): 1489 name = fileobj.name 1490 if hasattr(fileobj, "mode"): 1491 self._mode = fileobj.mode 1492 self._extfileobj = True 1493 self.name = os.path.abspath(name) if name else None 1494 self.fileobj = fileobj 1495 1496 # Init attributes. 1497 if format is not None: 1498 self.format = format 1499 if tarinfo is not None: 1500 self.tarinfo = tarinfo 1501 if dereference is not None: 1502 self.dereference = dereference 1503 if ignore_zeros is not None: 1504 self.ignore_zeros = ignore_zeros 1505 if encoding is not None: 1506 self.encoding = encoding 1507 self.errors = errors 1508 1509 if pax_headers is not None and self.format == PAX_FORMAT: 1510 self.pax_headers = pax_headers 1511 else: 1512 self.pax_headers = {} 1513 1514 if debug is not None: 1515 self.debug = debug 1516 if errorlevel is not None: 1517 self.errorlevel = errorlevel 1518 1519 # Init datastructures. 1520 self.copybufsize = copybufsize 1521 self.closed = False 1522 self.members = [] # list of members as TarInfo objects 1523 self._loaded = False # flag if all members have been read 1524 self.offset = self.fileobj.tell() 1525 # current position in the archive file 1526 self.inodes = {} # dictionary caching the inodes of 1527 # archive members already added 1528 1529 try: 1530 if self.mode == "r": 1531 self.firstmember = None 1532 self.firstmember = self.next() 1533 1534 if self.mode == "a": 1535 # Move to the end of the archive, 1536 # before the first empty block. 1537 while True: 1538 self.fileobj.seek(self.offset) 1539 try: 1540 tarinfo = self.tarinfo.fromtarfile(self) 1541 self.members.append(tarinfo) 1542 except EOFHeaderError: 1543 self.fileobj.seek(self.offset) 1544 break 1545 except HeaderError as e: 1546 raise ReadError(str(e)) from None 1547 1548 if self.mode in ("a", "w", "x"): 1549 self._loaded = True 1550 1551 if self.pax_headers: 1552 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) 1553 self.fileobj.write(buf) 1554 self.offset += len(buf) 1555 except: 1556 if not self._extfileobj: 1557 self.fileobj.close() 1558 self.closed = True 1559 raise 1560 1561 #-------------------------------------------------------------------------- 1562 # Below are the classmethods which act as alternate constructors to the 1563 # TarFile class. The open() method is the only one that is needed for 1564 # public use; it is the "super"-constructor and is able to select an 1565 # adequate "sub"-constructor for a particular compression using the mapping 1566 # from OPEN_METH. 1567 # 1568 # This concept allows one to subclass TarFile without losing the comfort of 1569 # the super-constructor. A sub-constructor is registered and made available 1570 # by adding it to the mapping in OPEN_METH. 1571 1572 @classmethod 1573 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): 1574 """Open a tar archive for reading, writing or appending. Return 1575 an appropriate TarFile class. 1576 1577 mode: 1578 'r' or 'r:*' open for reading with transparent compression 1579 'r:' open for reading exclusively uncompressed 1580 'r:gz' open for reading with gzip compression 1581 'r:bz2' open for reading with bzip2 compression 1582 'r:xz' open for reading with lzma compression 1583 'a' or 'a:' open for appending, creating the file if necessary 1584 'w' or 'w:' open for writing without compression 1585 'w:gz' open for writing with gzip compression 1586 'w:bz2' open for writing with bzip2 compression 1587 'w:xz' open for writing with lzma compression 1588 1589 'x' or 'x:' create a tarfile exclusively without compression, raise 1590 an exception if the file is already created 1591 'x:gz' create a gzip compressed tarfile, raise an exception 1592 if the file is already created 1593 'x:bz2' create a bzip2 compressed tarfile, raise an exception 1594 if the file is already created 1595 'x:xz' create an lzma compressed tarfile, raise an exception 1596 if the file is already created 1597 1598 'r|*' open a stream of tar blocks with transparent compression 1599 'r|' open an uncompressed stream of tar blocks for reading 1600 'r|gz' open a gzip compressed stream of tar blocks 1601 'r|bz2' open a bzip2 compressed stream of tar blocks 1602 'r|xz' open an lzma compressed stream of tar blocks 1603 'w|' open an uncompressed stream for writing 1604 'w|gz' open a gzip compressed stream for writing 1605 'w|bz2' open a bzip2 compressed stream for writing 1606 'w|xz' open an lzma compressed stream for writing 1607 """ 1608 1609 if not name and not fileobj: 1610 raise ValueError("nothing to open") 1611 1612 if mode in ("r", "r:*"): 1613 # Find out which *open() is appropriate for opening the file. 1614 def not_compressed(comptype): 1615 return cls.OPEN_METH[comptype] == 'taropen' 1616 error_msgs = [] 1617 for comptype in sorted(cls.OPEN_METH, key=not_compressed): 1618 func = getattr(cls, cls.OPEN_METH[comptype]) 1619 if fileobj is not None: 1620 saved_pos = fileobj.tell() 1621 try: 1622 return func(name, "r", fileobj, **kwargs) 1623 except (ReadError, CompressionError) as e: 1624 error_msgs.append(f'- method {comptype}: {e!r}') 1625 if fileobj is not None: 1626 fileobj.seek(saved_pos) 1627 continue 1628 error_msgs_summary = '\n'.join(error_msgs) 1629 raise ReadError(f"file could not be opened successfully:\n{error_msgs_summary}") 1630 1631 elif ":" in mode: 1632 filemode, comptype = mode.split(":", 1) 1633 filemode = filemode or "r" 1634 comptype = comptype or "tar" 1635 1636 # Select the *open() function according to 1637 # given compression. 1638 if comptype in cls.OPEN_METH: 1639 func = getattr(cls, cls.OPEN_METH[comptype]) 1640 else: 1641 raise CompressionError("unknown compression type %r" % comptype) 1642 return func(name, filemode, fileobj, **kwargs) 1643 1644 elif "|" in mode: 1645 filemode, comptype = mode.split("|", 1) 1646 filemode = filemode or "r" 1647 comptype = comptype or "tar" 1648 1649 if filemode not in ("r", "w"): 1650 raise ValueError("mode must be 'r' or 'w'") 1651 1652 stream = _Stream(name, filemode, comptype, fileobj, bufsize) 1653 try: 1654 t = cls(name, filemode, stream, **kwargs) 1655 except: 1656 stream.close() 1657 raise 1658 t._extfileobj = False 1659 return t 1660 1661 elif mode in ("a", "w", "x"): 1662 return cls.taropen(name, mode, fileobj, **kwargs) 1663 1664 raise ValueError("undiscernible mode") 1665 1666 @classmethod 1667 def taropen(cls, name, mode="r", fileobj=None, **kwargs): 1668 """Open uncompressed tar archive name for reading or writing. 1669 """ 1670 if mode not in ("r", "a", "w", "x"): 1671 raise ValueError("mode must be 'r', 'a', 'w' or 'x'") 1672 return cls(name, mode, fileobj, **kwargs) 1673 1674 @classmethod 1675 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1676 """Open gzip compressed tar archive name for reading or writing. 1677 Appending is not allowed. 1678 """ 1679 if mode not in ("r", "w", "x"): 1680 raise ValueError("mode must be 'r', 'w' or 'x'") 1681 1682 try: 1683 from gzip import GzipFile 1684 except ImportError: 1685 raise CompressionError("gzip module is not available") from None 1686 1687 try: 1688 fileobj = GzipFile(name, mode + "b", compresslevel, fileobj) 1689 except OSError as e: 1690 if fileobj is not None and mode == 'r': 1691 raise ReadError("not a gzip file") from e 1692 raise 1693 1694 try: 1695 t = cls.taropen(name, mode, fileobj, **kwargs) 1696 except OSError as e: 1697 fileobj.close() 1698 if mode == 'r': 1699 raise ReadError("not a gzip file") from e 1700 raise 1701 except: 1702 fileobj.close() 1703 raise 1704 t._extfileobj = False 1705 return t 1706 1707 @classmethod 1708 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1709 """Open bzip2 compressed tar archive name for reading or writing. 1710 Appending is not allowed. 1711 """ 1712 if mode not in ("r", "w", "x"): 1713 raise ValueError("mode must be 'r', 'w' or 'x'") 1714 1715 try: 1716 from bz2 import BZ2File 1717 except ImportError: 1718 raise CompressionError("bz2 module is not available") from None 1719 1720 fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel) 1721 1722 try: 1723 t = cls.taropen(name, mode, fileobj, **kwargs) 1724 except (OSError, EOFError) as e: 1725 fileobj.close() 1726 if mode == 'r': 1727 raise ReadError("not a bzip2 file") from e 1728 raise 1729 except: 1730 fileobj.close() 1731 raise 1732 t._extfileobj = False 1733 return t 1734 1735 @classmethod 1736 def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs): 1737 """Open lzma compressed tar archive name for reading or writing. 1738 Appending is not allowed. 1739 """ 1740 if mode not in ("r", "w", "x"): 1741 raise ValueError("mode must be 'r', 'w' or 'x'") 1742 1743 try: 1744 from lzma import LZMAFile, LZMAError 1745 except ImportError: 1746 raise CompressionError("lzma module is not available") from None 1747 1748 fileobj = LZMAFile(fileobj or name, mode, preset=preset) 1749 1750 try: 1751 t = cls.taropen(name, mode, fileobj, **kwargs) 1752 except (LZMAError, EOFError) as e: 1753 fileobj.close() 1754 if mode == 'r': 1755 raise ReadError("not an lzma file") from e 1756 raise 1757 except: 1758 fileobj.close() 1759 raise 1760 t._extfileobj = False 1761 return t 1762 1763 # All *open() methods are registered here. 1764 OPEN_METH = { 1765 "tar": "taropen", # uncompressed tar 1766 "gz": "gzopen", # gzip compressed tar 1767 "bz2": "bz2open", # bzip2 compressed tar 1768 "xz": "xzopen" # lzma compressed tar 1769 } 1770 1771 #-------------------------------------------------------------------------- 1772 # The public methods which TarFile provides: 1773 1774 def close(self): 1775 """Close the TarFile. In write-mode, two finishing zero blocks are 1776 appended to the archive. 1777 """ 1778 if self.closed: 1779 return 1780 1781 self.closed = True 1782 try: 1783 if self.mode in ("a", "w", "x"): 1784 self.fileobj.write(NUL * (BLOCKSIZE * 2)) 1785 self.offset += (BLOCKSIZE * 2) 1786 # fill up the end with zero-blocks 1787 # (like option -b20 for tar does) 1788 blocks, remainder = divmod(self.offset, RECORDSIZE) 1789 if remainder > 0: 1790 self.fileobj.write(NUL * (RECORDSIZE - remainder)) 1791 finally: 1792 if not self._extfileobj: 1793 self.fileobj.close() 1794 1795 def getmember(self, name): 1796 """Return a TarInfo object for member `name'. If `name' can not be 1797 found in the archive, KeyError is raised. If a member occurs more 1798 than once in the archive, its last occurrence is assumed to be the 1799 most up-to-date version. 1800 """ 1801 tarinfo = self._getmember(name.rstrip('/')) 1802 if tarinfo is None: 1803 raise KeyError("filename %r not found" % name) 1804 return tarinfo 1805 1806 def getmembers(self): 1807 """Return the members of the archive as a list of TarInfo objects. The 1808 list has the same order as the members in the archive. 1809 """ 1810 self._check() 1811 if not self._loaded: # if we want to obtain a list of 1812 self._load() # all members, we first have to 1813 # scan the whole archive. 1814 return self.members 1815 1816 def getnames(self): 1817 """Return the members of the archive as a list of their names. It has 1818 the same order as the list returned by getmembers(). 1819 """ 1820 return [tarinfo.name for tarinfo in self.getmembers()] 1821 1822 def gettarinfo(self, name=None, arcname=None, fileobj=None): 1823 """Create a TarInfo object from the result of os.stat or equivalent 1824 on an existing file. The file is either named by `name', or 1825 specified as a file object `fileobj' with a file descriptor. If 1826 given, `arcname' specifies an alternative name for the file in the 1827 archive, otherwise, the name is taken from the 'name' attribute of 1828 'fileobj', or the 'name' argument. The name should be a text 1829 string. 1830 """ 1831 self._check("awx") 1832 1833 # When fileobj is given, replace name by 1834 # fileobj's real name. 1835 if fileobj is not None: 1836 name = fileobj.name 1837 1838 # Building the name of the member in the archive. 1839 # Backward slashes are converted to forward slashes, 1840 # Absolute paths are turned to relative paths. 1841 if arcname is None: 1842 arcname = name 1843 drv, arcname = os.path.splitdrive(arcname) 1844 arcname = arcname.replace(os.sep, "/") 1845 arcname = arcname.lstrip("/") 1846 1847 # Now, fill the TarInfo object with 1848 # information specific for the file. 1849 tarinfo = self.tarinfo() 1850 tarinfo.tarfile = self # Not needed 1851 1852 # Use os.stat or os.lstat, depending on if symlinks shall be resolved. 1853 if fileobj is None: 1854 if not self.dereference: 1855 statres = os.lstat(name) 1856 else: 1857 statres = os.stat(name) 1858 else: 1859 statres = os.fstat(fileobj.fileno()) 1860 linkname = "" 1861 1862 stmd = statres.st_mode 1863 if stat.S_ISREG(stmd): 1864 inode = (statres.st_ino, statres.st_dev) 1865 if not self.dereference and statres.st_nlink > 1 and \ 1866 inode in self.inodes and arcname != self.inodes[inode]: 1867 # Is it a hardlink to an already 1868 # archived file? 1869 type = LNKTYPE 1870 linkname = self.inodes[inode] 1871 else: 1872 # The inode is added only if its valid. 1873 # For win32 it is always 0. 1874 type = REGTYPE 1875 if inode[0]: 1876 self.inodes[inode] = arcname 1877 elif stat.S_ISDIR(stmd): 1878 type = DIRTYPE 1879 elif stat.S_ISFIFO(stmd): 1880 type = FIFOTYPE 1881 elif stat.S_ISLNK(stmd): 1882 type = SYMTYPE 1883 linkname = os.readlink(name) 1884 elif stat.S_ISCHR(stmd): 1885 type = CHRTYPE 1886 elif stat.S_ISBLK(stmd): 1887 type = BLKTYPE 1888 else: 1889 return None 1890 1891 # Fill the TarInfo object with all 1892 # information we can get. 1893 tarinfo.name = arcname 1894 tarinfo.mode = stmd 1895 tarinfo.uid = statres.st_uid 1896 tarinfo.gid = statres.st_gid 1897 if type == REGTYPE: 1898 tarinfo.size = statres.st_size 1899 else: 1900 tarinfo.size = 0 1901 tarinfo.mtime = statres.st_mtime 1902 tarinfo.type = type 1903 tarinfo.linkname = linkname 1904 if pwd: 1905 try: 1906 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0] 1907 except KeyError: 1908 pass 1909 if grp: 1910 try: 1911 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0] 1912 except KeyError: 1913 pass 1914 1915 if type in (CHRTYPE, BLKTYPE): 1916 if hasattr(os, "major") and hasattr(os, "minor"): 1917 tarinfo.devmajor = os.major(statres.st_rdev) 1918 tarinfo.devminor = os.minor(statres.st_rdev) 1919 return tarinfo 1920 1921 def list(self, verbose=True, *, members=None): 1922 """Print a table of contents to sys.stdout. If `verbose' is False, only 1923 the names of the members are printed. If it is True, an `ls -l'-like 1924 output is produced. `members' is optional and must be a subset of the 1925 list returned by getmembers(). 1926 """ 1927 self._check() 1928 1929 if members is None: 1930 members = self 1931 for tarinfo in members: 1932 if verbose: 1933 _safe_print(stat.filemode(tarinfo.mode)) 1934 _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid, 1935 tarinfo.gname or tarinfo.gid)) 1936 if tarinfo.ischr() or tarinfo.isblk(): 1937 _safe_print("%10s" % 1938 ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor))) 1939 else: 1940 _safe_print("%10d" % tarinfo.size) 1941 _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ 1942 % time.localtime(tarinfo.mtime)[:6]) 1943 1944 _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else "")) 1945 1946 if verbose: 1947 if tarinfo.issym(): 1948 _safe_print("-> " + tarinfo.linkname) 1949 if tarinfo.islnk(): 1950 _safe_print("link to " + tarinfo.linkname) 1951 print() 1952 1953 def add(self, name, arcname=None, recursive=True, *, filter=None): 1954 """Add the file `name' to the archive. `name' may be any type of file 1955 (directory, fifo, symbolic link, etc.). If given, `arcname' 1956 specifies an alternative name for the file in the archive. 1957 Directories are added recursively by default. This can be avoided by 1958 setting `recursive' to False. `filter' is a function 1959 that expects a TarInfo object argument and returns the changed 1960 TarInfo object, if it returns None the TarInfo object will be 1961 excluded from the archive. 1962 """ 1963 self._check("awx") 1964 1965 if arcname is None: 1966 arcname = name 1967 1968 # Skip if somebody tries to archive the archive... 1969 if self.name is not None and os.path.abspath(name) == self.name: 1970 self._dbg(2, "tarfile: Skipped %r" % name) 1971 return 1972 1973 self._dbg(1, name) 1974 1975 # Create a TarInfo object from the file. 1976 tarinfo = self.gettarinfo(name, arcname) 1977 1978 if tarinfo is None: 1979 self._dbg(1, "tarfile: Unsupported type %r" % name) 1980 return 1981 1982 # Change or exclude the TarInfo object. 1983 if filter is not None: 1984 tarinfo = filter(tarinfo) 1985 if tarinfo is None: 1986 self._dbg(2, "tarfile: Excluded %r" % name) 1987 return 1988 1989 # Append the tar header and data to the archive. 1990 if tarinfo.isreg(): 1991 with bltn_open(name, "rb") as f: 1992 self.addfile(tarinfo, f) 1993 1994 elif tarinfo.isdir(): 1995 self.addfile(tarinfo) 1996 if recursive: 1997 for f in sorted(os.listdir(name)): 1998 self.add(os.path.join(name, f), os.path.join(arcname, f), 1999 recursive, filter=filter) 2000 2001 else: 2002 self.addfile(tarinfo) 2003 2004 def addfile(self, tarinfo, fileobj=None): 2005 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is 2006 given, it should be a binary file, and tarinfo.size bytes are read 2007 from it and added to the archive. You can create TarInfo objects 2008 directly, or by using gettarinfo(). 2009 """ 2010 self._check("awx") 2011 2012 tarinfo = copy.copy(tarinfo) 2013 2014 buf = tarinfo.tobuf(self.format, self.encoding, self.errors) 2015 self.fileobj.write(buf) 2016 self.offset += len(buf) 2017 bufsize=self.copybufsize 2018 # If there's data to follow, append it. 2019 if fileobj is not None: 2020 copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize) 2021 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) 2022 if remainder > 0: 2023 self.fileobj.write(NUL * (BLOCKSIZE - remainder)) 2024 blocks += 1 2025 self.offset += blocks * BLOCKSIZE 2026 2027 self.members.append(tarinfo) 2028 2029 def extractall(self, path=".", members=None, *, numeric_owner=False): 2030 """Extract all members from the archive to the current working 2031 directory and set owner, modification time and permissions on 2032 directories afterwards. `path' specifies a different directory 2033 to extract to. `members' is optional and must be a subset of the 2034 list returned by getmembers(). If `numeric_owner` is True, only 2035 the numbers for user/group names are used and not the names. 2036 """ 2037 directories = [] 2038 2039 if members is None: 2040 members = self 2041 2042 for tarinfo in members: 2043 if tarinfo.isdir(): 2044 # Extract directories with a safe mode. 2045 directories.append(tarinfo) 2046 tarinfo = copy.copy(tarinfo) 2047 tarinfo.mode = 0o700 2048 # Do not set_attrs directories, as we will do that further down 2049 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), 2050 numeric_owner=numeric_owner) 2051 2052 # Reverse sort directories. 2053 directories.sort(key=lambda a: a.name) 2054 directories.reverse() 2055 2056 # Set correct owner, mtime and filemode on directories. 2057 for tarinfo in directories: 2058 dirpath = os.path.join(path, tarinfo.name) 2059 try: 2060 self.chown(tarinfo, dirpath, numeric_owner=numeric_owner) 2061 self.utime(tarinfo, dirpath) 2062 self.chmod(tarinfo, dirpath) 2063 except ExtractError as e: 2064 if self.errorlevel > 1: 2065 raise 2066 else: 2067 self._dbg(1, "tarfile: %s" % e) 2068 2069 def extract(self, member, path="", set_attrs=True, *, numeric_owner=False): 2070 """Extract a member from the archive to the current working directory, 2071 using its full name. Its file information is extracted as accurately 2072 as possible. `member' may be a filename or a TarInfo object. You can 2073 specify a different directory using `path'. File attributes (owner, 2074 mtime, mode) are set unless `set_attrs' is False. If `numeric_owner` 2075 is True, only the numbers for user/group names are used and not 2076 the names. 2077 """ 2078 self._check("r") 2079 2080 if isinstance(member, str): 2081 tarinfo = self.getmember(member) 2082 else: 2083 tarinfo = member 2084 2085 # Prepare the link target for makelink(). 2086 if tarinfo.islnk(): 2087 tarinfo._link_target = os.path.join(path, tarinfo.linkname) 2088 2089 try: 2090 self._extract_member(tarinfo, os.path.join(path, tarinfo.name), 2091 set_attrs=set_attrs, 2092 numeric_owner=numeric_owner) 2093 except OSError as e: 2094 if self.errorlevel > 0: 2095 raise 2096 else: 2097 if e.filename is None: 2098 self._dbg(1, "tarfile: %s" % e.strerror) 2099 else: 2100 self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename)) 2101 except ExtractError as e: 2102 if self.errorlevel > 1: 2103 raise 2104 else: 2105 self._dbg(1, "tarfile: %s" % e) 2106 2107 def extractfile(self, member): 2108 """Extract a member from the archive as a file object. `member' may be 2109 a filename or a TarInfo object. If `member' is a regular file or 2110 a link, an io.BufferedReader object is returned. For all other 2111 existing members, None is returned. If `member' does not appear 2112 in the archive, KeyError is raised. 2113 """ 2114 self._check("r") 2115 2116 if isinstance(member, str): 2117 tarinfo = self.getmember(member) 2118 else: 2119 tarinfo = member 2120 2121 if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES: 2122 # Members with unknown types are treated as regular files. 2123 return self.fileobject(self, tarinfo) 2124 2125 elif tarinfo.islnk() or tarinfo.issym(): 2126 if isinstance(self.fileobj, _Stream): 2127 # A small but ugly workaround for the case that someone tries 2128 # to extract a (sym)link as a file-object from a non-seekable 2129 # stream of tar blocks. 2130 raise StreamError("cannot extract (sym)link as file object") 2131 else: 2132 # A (sym)link's file object is its target's file object. 2133 return self.extractfile(self._find_link_target(tarinfo)) 2134 else: 2135 # If there's no data associated with the member (directory, chrdev, 2136 # blkdev, etc.), return None instead of a file object. 2137 return None 2138 2139 def _extract_member(self, tarinfo, targetpath, set_attrs=True, 2140 numeric_owner=False): 2141 """Extract the TarInfo object tarinfo to a physical 2142 file called targetpath. 2143 """ 2144 # Fetch the TarInfo object for the given name 2145 # and build the destination pathname, replacing 2146 # forward slashes to platform specific separators. 2147 targetpath = targetpath.rstrip("/") 2148 targetpath = targetpath.replace("/", os.sep) 2149 2150 # Create all upper directories. 2151 upperdirs = os.path.dirname(targetpath) 2152 if upperdirs and not os.path.exists(upperdirs): 2153 # Create directories that are not part of the archive with 2154 # default permissions. 2155 os.makedirs(upperdirs) 2156 2157 if tarinfo.islnk() or tarinfo.issym(): 2158 self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname)) 2159 else: 2160 self._dbg(1, tarinfo.name) 2161 2162 if tarinfo.isreg(): 2163 self.makefile(tarinfo, targetpath) 2164 elif tarinfo.isdir(): 2165 self.makedir(tarinfo, targetpath) 2166 elif tarinfo.isfifo(): 2167 self.makefifo(tarinfo, targetpath) 2168 elif tarinfo.ischr() or tarinfo.isblk(): 2169 self.makedev(tarinfo, targetpath) 2170 elif tarinfo.islnk() or tarinfo.issym(): 2171 self.makelink(tarinfo, targetpath) 2172 elif tarinfo.type not in SUPPORTED_TYPES: 2173 self.makeunknown(tarinfo, targetpath) 2174 else: 2175 self.makefile(tarinfo, targetpath) 2176 2177 if set_attrs: 2178 self.chown(tarinfo, targetpath, numeric_owner) 2179 if not tarinfo.issym(): 2180 self.chmod(tarinfo, targetpath) 2181 self.utime(tarinfo, targetpath) 2182 2183 #-------------------------------------------------------------------------- 2184 # Below are the different file methods. They are called via 2185 # _extract_member() when extract() is called. They can be replaced in a 2186 # subclass to implement other functionality. 2187 2188 def makedir(self, tarinfo, targetpath): 2189 """Make a directory called targetpath. 2190 """ 2191 try: 2192 # Use a safe mode for the directory, the real mode is set 2193 # later in _extract_member(). 2194 os.mkdir(targetpath, 0o700) 2195 except FileExistsError: 2196 pass 2197 2198 def makefile(self, tarinfo, targetpath): 2199 """Make a file called targetpath. 2200 """ 2201 source = self.fileobj 2202 source.seek(tarinfo.offset_data) 2203 bufsize = self.copybufsize 2204 with bltn_open(targetpath, "wb") as target: 2205 if tarinfo.sparse is not None: 2206 for offset, size in tarinfo.sparse: 2207 target.seek(offset) 2208 copyfileobj(source, target, size, ReadError, bufsize) 2209 target.seek(tarinfo.size) 2210 target.truncate() 2211 else: 2212 copyfileobj(source, target, tarinfo.size, ReadError, bufsize) 2213 2214 def makeunknown(self, tarinfo, targetpath): 2215 """Make a file from a TarInfo object with an unknown type 2216 at targetpath. 2217 """ 2218 self.makefile(tarinfo, targetpath) 2219 self._dbg(1, "tarfile: Unknown file type %r, " \ 2220 "extracted as regular file." % tarinfo.type) 2221 2222 def makefifo(self, tarinfo, targetpath): 2223 """Make a fifo called targetpath. 2224 """ 2225 if hasattr(os, "mkfifo"): 2226 os.mkfifo(targetpath) 2227 else: 2228 raise ExtractError("fifo not supported by system") 2229 2230 def makedev(self, tarinfo, targetpath): 2231 """Make a character or block device called targetpath. 2232 """ 2233 if not hasattr(os, "mknod") or not hasattr(os, "makedev"): 2234 raise ExtractError("special devices not supported by system") 2235 2236 mode = tarinfo.mode 2237 if tarinfo.isblk(): 2238 mode |= stat.S_IFBLK 2239 else: 2240 mode |= stat.S_IFCHR 2241 2242 os.mknod(targetpath, mode, 2243 os.makedev(tarinfo.devmajor, tarinfo.devminor)) 2244 2245 def makelink(self, tarinfo, targetpath): 2246 """Make a (symbolic) link called targetpath. If it cannot be created 2247 (platform limitation), we try to make a copy of the referenced file 2248 instead of a link. 2249 """ 2250 try: 2251 # For systems that support symbolic and hard links. 2252 if tarinfo.issym(): 2253 if os.path.lexists(targetpath): 2254 # Avoid FileExistsError on following os.symlink. 2255 os.unlink(targetpath) 2256 os.symlink(tarinfo.linkname, targetpath) 2257 else: 2258 # See extract(). 2259 if os.path.exists(tarinfo._link_target): 2260 os.link(tarinfo._link_target, targetpath) 2261 else: 2262 self._extract_member(self._find_link_target(tarinfo), 2263 targetpath) 2264 except symlink_exception: 2265 try: 2266 self._extract_member(self._find_link_target(tarinfo), 2267 targetpath) 2268 except KeyError: 2269 raise ExtractError("unable to resolve link inside archive") from None 2270 2271 def chown(self, tarinfo, targetpath, numeric_owner): 2272 """Set owner of targetpath according to tarinfo. If numeric_owner 2273 is True, use .gid/.uid instead of .gname/.uname. If numeric_owner 2274 is False, fall back to .gid/.uid when the search based on name 2275 fails. 2276 """ 2277 if hasattr(os, "geteuid") and os.geteuid() == 0: 2278 # We have to be root to do so. 2279 g = tarinfo.gid 2280 u = tarinfo.uid 2281 if not numeric_owner: 2282 try: 2283 if grp: 2284 g = grp.getgrnam(tarinfo.gname)[2] 2285 except KeyError: 2286 pass 2287 try: 2288 if pwd: 2289 u = pwd.getpwnam(tarinfo.uname)[2] 2290 except KeyError: 2291 pass 2292 try: 2293 if tarinfo.issym() and hasattr(os, "lchown"): 2294 os.lchown(targetpath, u, g) 2295 else: 2296 os.chown(targetpath, u, g) 2297 except OSError as e: 2298 raise ExtractError("could not change owner") from e 2299 2300 def chmod(self, tarinfo, targetpath): 2301 """Set file permissions of targetpath according to tarinfo. 2302 """ 2303 try: 2304 os.chmod(targetpath, tarinfo.mode) 2305 except OSError as e: 2306 raise ExtractError("could not change mode") from e 2307 2308 def utime(self, tarinfo, targetpath): 2309 """Set modification time of targetpath according to tarinfo. 2310 """ 2311 if not hasattr(os, 'utime'): 2312 return 2313 try: 2314 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) 2315 except OSError as e: 2316 raise ExtractError("could not change modification time") from e 2317 2318 #-------------------------------------------------------------------------- 2319 def next(self): 2320 """Return the next member of the archive as a TarInfo object, when 2321 TarFile is opened for reading. Return None if there is no more 2322 available. 2323 """ 2324 self._check("ra") 2325 if self.firstmember is not None: 2326 m = self.firstmember 2327 self.firstmember = None 2328 return m 2329 2330 # Advance the file pointer. 2331 if self.offset != self.fileobj.tell(): 2332 self.fileobj.seek(self.offset - 1) 2333 if not self.fileobj.read(1): 2334 raise ReadError("unexpected end of data") 2335 2336 # Read the next block. 2337 tarinfo = None 2338 while True: 2339 try: 2340 tarinfo = self.tarinfo.fromtarfile(self) 2341 except EOFHeaderError as e: 2342 if self.ignore_zeros: 2343 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2344 self.offset += BLOCKSIZE 2345 continue 2346 except InvalidHeaderError as e: 2347 if self.ignore_zeros: 2348 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2349 self.offset += BLOCKSIZE 2350 continue 2351 elif self.offset == 0: 2352 raise ReadError(str(e)) from None 2353 except EmptyHeaderError: 2354 if self.offset == 0: 2355 raise ReadError("empty file") from None 2356 except TruncatedHeaderError as e: 2357 if self.offset == 0: 2358 raise ReadError(str(e)) from None 2359 except SubsequentHeaderError as e: 2360 raise ReadError(str(e)) from None 2361 except Exception as e: 2362 try: 2363 import zlib 2364 if isinstance(e, zlib.error): 2365 raise ReadError(f'zlib error: {e}') from None 2366 else: 2367 raise e 2368 except ImportError: 2369 raise e 2370 break 2371 2372 if tarinfo is not None: 2373 self.members.append(tarinfo) 2374 else: 2375 self._loaded = True 2376 2377 return tarinfo 2378 2379 #-------------------------------------------------------------------------- 2380 # Little helper methods: 2381 2382 def _getmember(self, name, tarinfo=None, normalize=False): 2383 """Find an archive member by name from bottom to top. 2384 If tarinfo is given, it is used as the starting point. 2385 """ 2386 # Ensure that all members have been loaded. 2387 members = self.getmembers() 2388 2389 # Limit the member search list up to tarinfo. 2390 if tarinfo is not None: 2391 members = members[:members.index(tarinfo)] 2392 2393 if normalize: 2394 name = os.path.normpath(name) 2395 2396 for member in reversed(members): 2397 if normalize: 2398 member_name = os.path.normpath(member.name) 2399 else: 2400 member_name = member.name 2401 2402 if name == member_name: 2403 return member 2404 2405 def _load(self): 2406 """Read through the entire archive file and look for readable 2407 members. 2408 """ 2409 while True: 2410 tarinfo = self.next() 2411 if tarinfo is None: 2412 break 2413 self._loaded = True 2414 2415 def _check(self, mode=None): 2416 """Check if TarFile is still open, and if the operation's mode 2417 corresponds to TarFile's mode. 2418 """ 2419 if self.closed: 2420 raise OSError("%s is closed" % self.__class__.__name__) 2421 if mode is not None and self.mode not in mode: 2422 raise OSError("bad operation for mode %r" % self.mode) 2423 2424 def _find_link_target(self, tarinfo): 2425 """Find the target member of a symlink or hardlink member in the 2426 archive. 2427 """ 2428 if tarinfo.issym(): 2429 # Always search the entire archive. 2430 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname))) 2431 limit = None 2432 else: 2433 # Search the archive before the link, because a hard link is 2434 # just a reference to an already archived file. 2435 linkname = tarinfo.linkname 2436 limit = tarinfo 2437 2438 member = self._getmember(linkname, tarinfo=limit, normalize=True) 2439 if member is None: 2440 raise KeyError("linkname %r not found" % linkname) 2441 return member 2442 2443 def __iter__(self): 2444 """Provide an iterator object. 2445 """ 2446 if self._loaded: 2447 yield from self.members 2448 return 2449 2450 # Yield items using TarFile's next() method. 2451 # When all members have been read, set TarFile as _loaded. 2452 index = 0 2453 # Fix for SF #1100429: Under rare circumstances it can 2454 # happen that getmembers() is called during iteration, 2455 # which will have already exhausted the next() method. 2456 if self.firstmember is not None: 2457 tarinfo = self.next() 2458 index += 1 2459 yield tarinfo 2460 2461 while True: 2462 if index < len(self.members): 2463 tarinfo = self.members[index] 2464 elif not self._loaded: 2465 tarinfo = self.next() 2466 if not tarinfo: 2467 self._loaded = True 2468 return 2469 else: 2470 return 2471 index += 1 2472 yield tarinfo 2473 2474 def _dbg(self, level, msg): 2475 """Write debugging output to sys.stderr. 2476 """ 2477 if level <= self.debug: 2478 print(msg, file=sys.stderr) 2479 2480 def __enter__(self): 2481 self._check() 2482 return self 2483 2484 def __exit__(self, type, value, traceback): 2485 if type is None: 2486 self.close() 2487 else: 2488 # An exception occurred. We must not call close() because 2489 # it would try to write end-of-archive blocks and padding. 2490 if not self._extfileobj: 2491 self.fileobj.close() 2492 self.closed = True 2493 2494#-------------------- 2495# exported functions 2496#-------------------- 2497def is_tarfile(name): 2498 """Return True if name points to a tar archive that we 2499 are able to handle, else return False. 2500 2501 'name' should be a string, file, or file-like object. 2502 """ 2503 try: 2504 if hasattr(name, "read"): 2505 t = open(fileobj=name) 2506 else: 2507 t = open(name) 2508 t.close() 2509 return True 2510 except TarError: 2511 return False 2512 2513open = TarFile.open 2514 2515 2516def main(): 2517 import argparse 2518 2519 description = 'A simple command-line interface for tarfile module.' 2520 parser = argparse.ArgumentParser(description=description) 2521 parser.add_argument('-v', '--verbose', action='store_true', default=False, 2522 help='Verbose output') 2523 group = parser.add_mutually_exclusive_group(required=True) 2524 group.add_argument('-l', '--list', metavar='<tarfile>', 2525 help='Show listing of a tarfile') 2526 group.add_argument('-e', '--extract', nargs='+', 2527 metavar=('<tarfile>', '<output_dir>'), 2528 help='Extract tarfile into target dir') 2529 group.add_argument('-c', '--create', nargs='+', 2530 metavar=('<name>', '<file>'), 2531 help='Create tarfile from sources') 2532 group.add_argument('-t', '--test', metavar='<tarfile>', 2533 help='Test if a tarfile is valid') 2534 args = parser.parse_args() 2535 2536 if args.test is not None: 2537 src = args.test 2538 if is_tarfile(src): 2539 with open(src, 'r') as tar: 2540 tar.getmembers() 2541 print(tar.getmembers(), file=sys.stderr) 2542 if args.verbose: 2543 print('{!r} is a tar archive.'.format(src)) 2544 else: 2545 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2546 2547 elif args.list is not None: 2548 src = args.list 2549 if is_tarfile(src): 2550 with TarFile.open(src, 'r:*') as tf: 2551 tf.list(verbose=args.verbose) 2552 else: 2553 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2554 2555 elif args.extract is not None: 2556 if len(args.extract) == 1: 2557 src = args.extract[0] 2558 curdir = os.curdir 2559 elif len(args.extract) == 2: 2560 src, curdir = args.extract 2561 else: 2562 parser.exit(1, parser.format_help()) 2563 2564 if is_tarfile(src): 2565 with TarFile.open(src, 'r:*') as tf: 2566 tf.extractall(path=curdir) 2567 if args.verbose: 2568 if curdir == '.': 2569 msg = '{!r} file is extracted.'.format(src) 2570 else: 2571 msg = ('{!r} file is extracted ' 2572 'into {!r} directory.').format(src, curdir) 2573 print(msg) 2574 else: 2575 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2576 2577 elif args.create is not None: 2578 tar_name = args.create.pop(0) 2579 _, ext = os.path.splitext(tar_name) 2580 compressions = { 2581 # gz 2582 '.gz': 'gz', 2583 '.tgz': 'gz', 2584 # xz 2585 '.xz': 'xz', 2586 '.txz': 'xz', 2587 # bz2 2588 '.bz2': 'bz2', 2589 '.tbz': 'bz2', 2590 '.tbz2': 'bz2', 2591 '.tb2': 'bz2', 2592 } 2593 tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w' 2594 tar_files = args.create 2595 2596 with TarFile.open(tar_name, tar_mode) as tf: 2597 for file_name in tar_files: 2598 tf.add(file_name) 2599 2600 if args.verbose: 2601 print('{!r} file created.'.format(tar_name)) 2602 2603if __name__ == '__main__': 2604 main() 2605