1#!/usr/bin/env python3 2#------------------------------------------------------------------- 3# tarfile.py 4#------------------------------------------------------------------- 5# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de> 6# All rights reserved. 7# 8# Permission is hereby granted, free of charge, to any person 9# obtaining a copy of this software and associated documentation 10# files (the "Software"), to deal in the Software without 11# restriction, including without limitation the rights to use, 12# copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the 14# Software is furnished to do so, subject to the following 15# conditions: 16# 17# The above copyright notice and this permission notice shall be 18# included in all copies or substantial portions of the Software. 19# 20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 27# OTHER DEALINGS IN THE SOFTWARE. 28# 29"""Read from and write to tar format archives. 30""" 31 32version = "0.9.0" 33__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)" 34__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend." 
#---------
# Imports
#---------
from builtins import open as bltn_open
import sys
import os
import io
import shutil
import stat
import time
import struct
import copy
import re
import warnings

# pwd/grp are POSIX-only; when unavailable (e.g. Windows) they are set to
# None and the module falls back to numeric uid/gid handling.
try:
    import pwd
except ImportError:
    pwd = None
try:
    import grp
except ImportError:
    grp = None

# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass

# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError",
           "CompressionError", "StreamError", "ExtractError", "HeaderError",
           "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT",
           "DEFAULT_FORMAT", "open"]


#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = PAX_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# On Windows the filesystem encoding may be a legacy code page; use UTF-8
# for header text there, otherwise follow the filesystem encoding.
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()

#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------

def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.

       The result is exactly *length* bytes: the encoded string is
       truncated to fit, and any remainder is padded with NULs.
       Raises ValueError if s is None.
    """
    if s is None:
        raise ValueError("metadata cannot contain None")
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL

def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

       Everything from the first NUL byte onwards is discarded.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)

def nti(s):
    """Convert a number field to a python number.

       Raises InvalidHeaderError if the field is neither valid octal
       nor the GNU base-256 encoding.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    if s[0] in (0o200, 0o377):
        # GNU base-256: 0o200 marks a positive value, 0o377 a negative
        # one; the remaining bytes are a big-endian payload.
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            s = nts(s, "ascii", "strict")
            n = int(s.strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n

def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

       Raises ValueError if the value does not fit in the field for the
       given format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    # NOTE(review): original_n is assigned but never read in this block;
    # retained as-is.
    original_n = n
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            # Two's-complement adjustment for the negative encoding.
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s

def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # The 8-byte chksum field starts at offset 148; "8x" skips it, which
    # is equivalent to counting it as 8 spaces (hence the constant 256).
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum

def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises *exception* if src runs out of data before *length* bytes
       have been copied.
    """
    bufsize = bufsize or 16 * 1024
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, bufsize)
        return

    blocks, remainder = divmod(length, bufsize)
    for b in range(blocks):
        buf = src.read(bufsize)
        if len(buf) < bufsize:
            raise exception("unexpected end of data")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise exception("unexpected end of data")
        dst.write(buf)
    return

def _safe_print(s):
    # Print s followed by a space, replacing characters that the current
    # stdout encoding cannot represent with backslash escapes.
    encoding = getattr(sys.stdout, 'encoding', None)
    if encoding is not None:
        s = s.encode(encoding, 'backslashreplace').decode(encoding)
    print(s, end=' ')


class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass

#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        # Translate the "r"/"w" mode to os.open flags; force binary mode
        # on platforms that distinguish it.
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        os.close(self.fd)

    def read(self, size):
        return os.read(self.fd, size)

    def write(self, s):
        os.write(self.fd, s)

class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object. The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise. Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name = name or ""
        self.mode = mode
        self.comptype = comptype
        self.fileobj = fileobj
        self.bufsize = bufsize
        self.buf = b""
        self.pos = 0
        self.closed = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available") from None
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available") from None
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available") from None
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Constructor failed: close a file we opened ourselves and
            # mark the stream closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # Guard with hasattr: __init__ may have failed before "closed"
        # was assigned.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                         -self.zlib.MAX_WBITS,
                                         self.zlib.DEF_MEM_LEVEL,
                                         0)
        timestamp = struct.pack("<L", int(time.time()))
        # Hand-written gzip member header (magic, deflate, FNAME flag).
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # Honor "directory components removed" from RFC1952
        self.name = os.path.basename(self.name)
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        # self.pos tracks the uncompressed position.
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # Gzip trailer: CRC32 then uncompressed size mod 2**32.
                    self.fileobj.write(struct.pack("<L", self.crc))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        # Skip the optional header fields signalled by the flag byte:
        # FEXTRA, FNAME, FCOMMENT, FHCRC.
        if flag & 4:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos - self.pos >= 0:
            # Forward seek is emulated by reading and discarding.
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size):
        """Return the next size number of bytes from the stream."""
        assert size is not None
        buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        t = [self.dbuf]
        while c < size:
            # Skip underlying buffer to avoid unaligned double buffering.
            if self.buf:
                buf = self.buf
                self.buf = b""
            else:
                buf = self.fileobj.read(self.bufsize)
                if not buf:
                    break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception as e:
                raise ReadError("invalid compressed data") from e
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.dbuf = t[size:]
        return t[:size]

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        t = [self.buf]
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.buf = t[size:]
        return t[:size]
# class _Stream

class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # Read one block up front so the magic bytes can be sniffed.
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        # First call returns the sniffed block, then delegates directly
        # to the underlying file object's read.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        # Identify the compression type from well-known magic prefixes.
        if self.buf.startswith(b"\x1f\x8b\x08"):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy

#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each map entry is (is_data, start, stop, realpos): logical range
        # [start, stop) backed either by real data at realpos in fileobj
        # (is_data=True) or by implicit zeros (is_data=False).
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.
        """
        # The position is always clamped to [0, self.size].
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry containing the current position; the
            # search wraps around since seeks may move backwards.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                b = self.fileobj.read(length)
                if len(b) != length:
                    raise ReadError("unexpected end of data")
                buf += b
            else:
                # Hole in a sparse member: synthesize zeros.
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        self.closed = True
#class _FileInFile

class ExFileObject(io.BufferedReader):
    # File-like object returned for an archive member; wraps the member's
    # byte range (and sparse map) in a buffered reader.

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                              tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject


#-----------------------------
# extraction filters (PEP 706)
#-----------------------------

class FilterError(TarError):
    pass

class AbsolutePathError(FilterError):
    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        super().__init__(f'member {tarinfo.name!r} has an absolute path')

class OutsideDestinationError(FilterError):
    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        super().__init__(f'{tarinfo.name!r} would be extracted to {path!r}, '
                         + 'which is outside the destination')

class SpecialFileError(FilterError):
    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        super().__init__(f'{tarinfo.name!r} is a special file')

class AbsoluteLinkError(FilterError):
    def __init__(self, tarinfo):
        self.tarinfo = tarinfo
        super().__init__(f'{tarinfo.name!r} is a symlink to an absolute path')

class LinkOutsideDestinationError(FilterError):
    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        super().__init__(f'{tarinfo.name!r} would link to {path!r}, '
                         + 'which is outside the destination')

class LinkFallbackError(FilterError):
    def __init__(self, tarinfo, path):
        self.tarinfo = tarinfo
        self._path = path
        super().__init__(f'link {tarinfo.name!r} would be extracted as a '
                         + f'copy of {path!r}, which was rejected')

# Errors caused by filters -- both "fatal" and "non-fatal" -- that
# we consider to be issues with the argument, rather than a bug in the
# filter function
_FILTER_ERRORS = (FilterError, OSError, ExtractError)

def _get_filtered_attrs(member, dest_path, for_data=True):
    # Compute the attribute overrides the 'tar' (for_data=False) or
    # 'data' (for_data=True) extraction filter applies to *member* for
    # extraction into *dest_path*.  Returns a dict suitable for
    # TarInfo.replace(); raises a FilterError subclass for members that
    # must be rejected outright.
    new_attrs = {}
    name = member.name
    dest_path = os.path.realpath(dest_path, strict=os.path.ALLOW_MISSING)
    # Strip leading / (tar's directory separator) from filenames.
    # Include os.sep (target OS directory separator) as well.
    if name.startswith(('/', os.sep)):
        name = new_attrs['name'] = member.path.lstrip('/' + os.sep)
    if os.path.isabs(name):
        # Path is absolute even after stripping.
        # For example, 'C:/foo' on Windows.
        raise AbsolutePathError(member)
    # Ensure we stay in the destination
    target_path = os.path.realpath(os.path.join(dest_path, name),
                                   strict=os.path.ALLOW_MISSING)
    if os.path.commonpath([target_path, dest_path]) != dest_path:
        raise OutsideDestinationError(member, target_path)
    # Limit permissions (no high bits, and go-w)
    mode = member.mode
    if mode is not None:
        # Strip high bits & group/other write bits
        mode = mode & 0o755
        if for_data:
            # For data, handle permissions & file types
            if member.isreg() or member.islnk():
                if not mode & 0o100:
                    # Clear executable bits if not executable by user
                    mode &= ~0o111
                # Ensure owner can read & write
                mode |= 0o600
            elif member.isdir() or member.issym():
                # Ignore mode for directories & symlinks
                mode = None
            else:
                # Reject special files
                raise SpecialFileError(member)
        if mode != member.mode:
            new_attrs['mode'] = mode
    if for_data:
        # Ignore ownership for 'data'
        if member.uid is not None:
            new_attrs['uid'] = None
        if member.gid is not None:
            new_attrs['gid'] = None
        if member.uname is not None:
            new_attrs['uname'] = None
        if member.gname is not None:
            new_attrs['gname'] = None
        # Check link destination for 'data'
        if member.islnk() or member.issym():
            if os.path.isabs(member.linkname):
                raise AbsoluteLinkError(member)
            normalized = os.path.normpath(member.linkname)
            if normalized != member.linkname:
                new_attrs['linkname'] = normalized
            if member.issym():
                # Symlink targets are relative to the member's directory.
                target_path = os.path.join(dest_path,
                                           os.path.dirname(name),
                                           member.linkname)
            else:
                # Hardlink targets are relative to the archive root.
                target_path = os.path.join(dest_path,
                                           member.linkname)
            target_path = os.path.realpath(target_path,
                                           strict=os.path.ALLOW_MISSING)
            if os.path.commonpath([target_path, dest_path]) != dest_path:
                raise LinkOutsideDestinationError(member, target_path)
    return new_attrs

def fully_trusted_filter(member,
                         dest_path):
    # 'fully_trusted' filter: extract every member exactly as stored.
    return member

def tar_filter(member, dest_path):
    # 'tar' filter: block escapes from the destination but keep
    # ownership and most permission bits.
    new_attrs = _get_filtered_attrs(member, dest_path, False)
    if new_attrs:
        return member.replace(**new_attrs, deep=False)
    return member

def data_filter(member, dest_path):
    # 'data' filter: strictest policy; also drops ownership and rejects
    # special files and unsafe links.
    new_attrs = _get_filtered_attrs(member, dest_path, True)
    if new_attrs:
        return member.replace(**new_attrs, deep=False)
    return member

# Map the string names accepted by extract()/extractall(filter=...) to
# the corresponding filter functions.
_NAMED_FILTERS = {
    "fully_trusted": fully_trusted_filter,
    "tar": tar_filter,
    "data": data_filter,
}

#------------------
# Exported Classes
#------------------

# Sentinel for replace() defaults, meaning "don't change the attribute"
_KEEP = object()

# Header length is digits followed by a space.
_header_length_prefix_re = re.compile(br"([0-9]{1,20}) ")

class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # Using a dict as __slots__ doubles as per-attribute documentation.
    __slots__ = dict(
        name = 'Name of the archive member.',
        mode = 'Permission bits.',
        uid = 'User ID of the user who originally stored this member.',
        gid = 'Group ID of the user who originally stored this member.',
        size = 'Size in bytes.',
        mtime = 'Time of last modification.',
        chksum = 'Header checksum.',
        type = ('File type. type is usually one of these constants: '
                'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, '
                'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'),
        linkname = ('Name of the target file name, which is only present '
                    'in TarInfo objects of type LNKTYPE and SYMTYPE.'),
        uname = 'User name.',
        gname = 'Group name.',
        devmajor = 'Device major number.',
        devminor = 'Device minor number.',
        offset = 'The tar header starts here.',
        offset_data = "The file's data starts here.",
        pax_headers = ('A dictionary containing key-value pairs of an '
                       'associated pax extended header.'),
        sparse = 'Sparse member information.',
        tarfile = None,
        _sparse_structs = None,
        _link_target = None,
    )

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

    @property
    def path(self):
        'In pax headers, "name" is called "path".'
        return self.name

    @path.setter
    def path(self, name):
        self.name = name

    @property
    def linkpath(self):
        'In pax headers, "linkname" is called "linkpath".'
        return self.linkname

    @linkpath.setter
    def linkpath(self, linkname):
        self.linkname = linkname

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))

    def replace(self, *,
                name=_KEEP, mtime=_KEEP, mode=_KEEP, linkname=_KEEP,
                uid=_KEEP, gid=_KEEP, uname=_KEEP, gname=_KEEP,
                deep=True, _KEEP=_KEEP):
        """Return a deep copy of self with the given attributes replaced.
        """
        if deep:
            result = copy.deepcopy(self)
        else:
            result = copy.copy(self)
        if name is not _KEEP:
            result.name = name
        if mtime is not _KEEP:
            result.mtime = mtime
        if mode is not _KEEP:
            result.mode = mode
        if linkname is not _KEEP:
            result.linkname = linkname
        if uid is not _KEEP:
            result.uid = uid
        if gid is not _KEEP:
            result.gid = gid
        if uname is not _KEEP:
            result.uname = uname
        if gname is not _KEEP:
            result.gname = gname
        return result

    def get_info(self):
        """Return the TarInfo's attributes as a dictionary.
        """
        if self.mode is None:
            mode = None
        else:
            mode = self.mode & 0o7777
        info = {
            "name": self.name,
            "mode": mode,
            "uid": self.uid,
            "gid": self.gid,
            "size": self.size,
            "mtime": self.mtime,
            "chksum": self.chksum,
            "type": self.type,
            "linkname": self.linkname,
            "uname": self.uname,
            "gname": self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor
        }

        # Directory names always carry a trailing slash in the archive.
        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.

           Raises ValueError for None-valued attributes or an unknown
           format.
        """
        info = self.get_info()
        for name, value in info.items():
            if value is None:
                raise ValueError("%s may not be None" % name)

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        # ustar cannot store long names directly; split into prefix+name.
        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        # Long names/linknames are emitted as extra pseudo-members
        # preceding the real header.
        buf = b""
        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            needs_pax = False

            val = info[name]
            val_is_float = isinstance(val, float)
            val_int = round(val) if val_is_float else val
            if not 0 <= val_int < 8 ** (digits - 1):
                # Avoid overflow.
                info[name] = 0
                needs_pax = True
            elif val_is_float:
                # Put rounded value in ustar header, and full
                # precision value in pax header.
                info[name] = val_int
                needs_pax = True

            # The existing pax header has priority.
            if needs_pax and name not in pax_headers:
                pax_headers[name] = str(val)

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")

    def _posix_split_name(self, name, encoding, errors):
        """Split a name longer than 100 chars into a prefix
           and a name part.

           Raises ValueError if no split point satisfies both field
           limits.
        """
        components = name.split("/")
        for i in range(1, len(components)):
            prefix = "/".join(components[:i])
            name = "/".join(components[i:])
            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
                break
        else:
            raise ValueError("name is too long")

        return prefix, name

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE)
        if has_device_fields:
            devmajor = itn(info.get("devmajor", 0), 8, format)
            devminor = itn(info.get("devminor", 0), 8, format)
        else:
            devmajor = stn("", 8, encoding, errors)
            devminor = stn("", 8, encoding, errors)

        # None values in metadata should cause ValueError.
        # itn()/stn() do this for all fields except type.
        filetype = info.get("type", REGTYPE)
        if filetype is None:
            raise ValueError("TarInfo.type must not be None")

        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            filetype,
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            devmajor,
            devminor,
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Patch the computed checksum back into the chksum field, which
        # starts at offset 148 (i.e. -364 from the end of the block).
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf

    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs.
The values 1197 must be strings. 1198 """ 1199 # Check if one of the fields contains surrogate characters and thereby 1200 # forces hdrcharset=BINARY, see _proc_pax() for more information. 1201 binary = False 1202 for keyword, value in pax_headers.items(): 1203 try: 1204 value.encode("utf-8", "strict") 1205 except UnicodeEncodeError: 1206 binary = True 1207 break 1208 1209 records = b"" 1210 if binary: 1211 # Put the hdrcharset field at the beginning of the header. 1212 records += b"21 hdrcharset=BINARY\n" 1213 1214 for keyword, value in pax_headers.items(): 1215 keyword = keyword.encode("utf-8") 1216 if binary: 1217 # Try to restore the original byte representation of `value'. 1218 # Needless to say, that the encoding must match the string. 1219 value = value.encode(encoding, "surrogateescape") 1220 else: 1221 value = value.encode("utf-8") 1222 1223 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' 1224 n = p = 0 1225 while True: 1226 n = l + len(str(p)) 1227 if n == p: 1228 break 1229 p = n 1230 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n" 1231 1232 # We use a hardcoded "././@PaxHeader" name like star does 1233 # instead of the one that POSIX recommends. 1234 info = {} 1235 info["name"] = "././@PaxHeader" 1236 info["type"] = type 1237 info["size"] = len(records) 1238 info["magic"] = POSIX_MAGIC 1239 1240 # Create pax header + record blocks. 1241 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \ 1242 cls._create_payload(records) 1243 1244 @classmethod 1245 def frombuf(cls, buf, encoding, errors): 1246 """Construct a TarInfo object from a 512 byte bytes object. 
1247 """ 1248 if len(buf) == 0: 1249 raise EmptyHeaderError("empty header") 1250 if len(buf) != BLOCKSIZE: 1251 raise TruncatedHeaderError("truncated header") 1252 if buf.count(NUL) == BLOCKSIZE: 1253 raise EOFHeaderError("end of file header") 1254 1255 chksum = nti(buf[148:156]) 1256 if chksum not in calc_chksums(buf): 1257 raise InvalidHeaderError("bad checksum") 1258 1259 obj = cls() 1260 obj.name = nts(buf[0:100], encoding, errors) 1261 obj.mode = nti(buf[100:108]) 1262 obj.uid = nti(buf[108:116]) 1263 obj.gid = nti(buf[116:124]) 1264 obj.size = nti(buf[124:136]) 1265 obj.mtime = nti(buf[136:148]) 1266 obj.chksum = chksum 1267 obj.type = buf[156:157] 1268 obj.linkname = nts(buf[157:257], encoding, errors) 1269 obj.uname = nts(buf[265:297], encoding, errors) 1270 obj.gname = nts(buf[297:329], encoding, errors) 1271 obj.devmajor = nti(buf[329:337]) 1272 obj.devminor = nti(buf[337:345]) 1273 prefix = nts(buf[345:500], encoding, errors) 1274 1275 # Old V7 tar format represents a directory as a regular 1276 # file with a trailing slash. 1277 if obj.type == AREGTYPE and obj.name.endswith("/"): 1278 obj.type = DIRTYPE 1279 1280 # The old GNU sparse format occupies some of the unused 1281 # space in the buffer for up to 4 sparse structures. 1282 # Save them for later processing in _proc_sparse(). 1283 if obj.type == GNUTYPE_SPARSE: 1284 pos = 386 1285 structs = [] 1286 for i in range(4): 1287 try: 1288 offset = nti(buf[pos:pos + 12]) 1289 numbytes = nti(buf[pos + 12:pos + 24]) 1290 except ValueError: 1291 break 1292 structs.append((offset, numbytes)) 1293 pos += 24 1294 isextended = bool(buf[482]) 1295 origsize = nti(buf[483:495]) 1296 obj._sparse_structs = (structs, isextended, origsize) 1297 1298 # Remove redundant slashes from directories. 1299 if obj.isdir(): 1300 obj.name = obj.name.rstrip("/") 1301 1302 # Reconstruct a ustar longname. 
1303 if prefix and obj.type not in GNU_TYPES: 1304 obj.name = prefix + "/" + obj.name 1305 return obj 1306 1307 @classmethod 1308 def fromtarfile(cls, tarfile): 1309 """Return the next TarInfo object from TarFile object 1310 tarfile. 1311 """ 1312 buf = tarfile.fileobj.read(BLOCKSIZE) 1313 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors) 1314 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE 1315 return obj._proc_member(tarfile) 1316 1317 #-------------------------------------------------------------------------- 1318 # The following are methods that are called depending on the type of a 1319 # member. The entry point is _proc_member() which can be overridden in a 1320 # subclass to add custom _proc_*() methods. A _proc_*() method MUST 1321 # implement the following 1322 # operations: 1323 # 1. Set self.offset_data to the position where the data blocks begin, 1324 # if there is data that follows. 1325 # 2. Set tarfile.offset to the position where the next member's header will 1326 # begin. 1327 # 3. Return self or another valid TarInfo object. 1328 def _proc_member(self, tarfile): 1329 """Choose the right processing method depending on 1330 the type and call it. 1331 """ 1332 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): 1333 return self._proc_gnulong(tarfile) 1334 elif self.type == GNUTYPE_SPARSE: 1335 return self._proc_sparse(tarfile) 1336 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): 1337 return self._proc_pax(tarfile) 1338 else: 1339 return self._proc_builtin(tarfile) 1340 1341 def _proc_builtin(self, tarfile): 1342 """Process a builtin type or an unknown type which 1343 will be treated as a regular file. 1344 """ 1345 self.offset_data = tarfile.fileobj.tell() 1346 offset = self.offset_data 1347 if self.isreg() or self.type not in SUPPORTED_TYPES: 1348 # Skip the following data blocks. 1349 offset += self._block(self.size) 1350 tarfile.offset = offset 1351 1352 # Patch the TarInfo object with saved global 1353 # header information. 
1354 self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors) 1355 1356 # Remove redundant slashes from directories. This is to be consistent 1357 # with frombuf(). 1358 if self.isdir(): 1359 self.name = self.name.rstrip("/") 1360 1361 return self 1362 1363 def _proc_gnulong(self, tarfile): 1364 """Process the blocks that hold a GNU longname 1365 or longlink member. 1366 """ 1367 buf = tarfile.fileobj.read(self._block(self.size)) 1368 1369 # Fetch the next header and process it. 1370 try: 1371 next = self.fromtarfile(tarfile) 1372 except HeaderError as e: 1373 raise SubsequentHeaderError(str(e)) from None 1374 1375 # Patch the TarInfo object from the next header with 1376 # the longname information. 1377 next.offset = self.offset 1378 if self.type == GNUTYPE_LONGNAME: 1379 next.name = nts(buf, tarfile.encoding, tarfile.errors) 1380 elif self.type == GNUTYPE_LONGLINK: 1381 next.linkname = nts(buf, tarfile.encoding, tarfile.errors) 1382 1383 # Remove redundant slashes from directories. This is to be consistent 1384 # with frombuf(). 1385 if next.isdir(): 1386 next.name = next.name.removesuffix("/") 1387 1388 return next 1389 1390 def _proc_sparse(self, tarfile): 1391 """Process a GNU sparse header plus extra headers. 1392 """ 1393 # We already collected some sparse structures in frombuf(). 1394 structs, isextended, origsize = self._sparse_structs 1395 del self._sparse_structs 1396 1397 # Collect sparse structures from extended header blocks. 
1398 while isextended: 1399 buf = tarfile.fileobj.read(BLOCKSIZE) 1400 pos = 0 1401 for i in range(21): 1402 try: 1403 offset = nti(buf[pos:pos + 12]) 1404 numbytes = nti(buf[pos + 12:pos + 24]) 1405 except ValueError: 1406 break 1407 if offset and numbytes: 1408 structs.append((offset, numbytes)) 1409 pos += 24 1410 isextended = bool(buf[504]) 1411 self.sparse = structs 1412 1413 self.offset_data = tarfile.fileobj.tell() 1414 tarfile.offset = self.offset_data + self._block(self.size) 1415 self.size = origsize 1416 return self 1417 1418 def _proc_pax(self, tarfile): 1419 """Process an extended or global header as described in 1420 POSIX.1-2008. 1421 """ 1422 # Read the header information. 1423 buf = tarfile.fileobj.read(self._block(self.size)) 1424 1425 # A pax header stores supplemental information for either 1426 # the following file (extended) or all following files 1427 # (global). 1428 if self.type == XGLTYPE: 1429 pax_headers = tarfile.pax_headers 1430 else: 1431 pax_headers = tarfile.pax_headers.copy() 1432 1433 # Parse pax header information. A record looks like that: 1434 # "%d %s=%s\n" % (length, keyword, value). length is the size 1435 # of the complete record including the length field itself and 1436 # the newline. 1437 pos = 0 1438 encoding = None 1439 raw_headers = [] 1440 while len(buf) > pos and buf[pos] != 0x00: 1441 if not (match := _header_length_prefix_re.match(buf, pos)): 1442 raise InvalidHeaderError("invalid header") 1443 try: 1444 length = int(match.group(1)) 1445 except ValueError: 1446 raise InvalidHeaderError("invalid header") 1447 # Headers must be at least 5 bytes, shortest being '5 x=\n'. 1448 # Value is allowed to be empty. 
1449 if length < 5: 1450 raise InvalidHeaderError("invalid header") 1451 if pos + length > len(buf): 1452 raise InvalidHeaderError("invalid header") 1453 1454 header_value_end_offset = match.start(1) + length - 1 # Last byte of the header 1455 keyword_and_value = buf[match.end(1) + 1:header_value_end_offset] 1456 raw_keyword, equals, raw_value = keyword_and_value.partition(b"=") 1457 1458 # Check the framing of the header. The last character must be '\n' (0x0A) 1459 if not raw_keyword or equals != b"=" or buf[header_value_end_offset] != 0x0A: 1460 raise InvalidHeaderError("invalid header") 1461 raw_headers.append((length, raw_keyword, raw_value)) 1462 1463 # Check if the pax header contains a hdrcharset field. This tells us 1464 # the encoding of the path, linkpath, uname and gname fields. Normally, 1465 # these fields are UTF-8 encoded but since POSIX.1-2008 tar 1466 # implementations are allowed to store them as raw binary strings if 1467 # the translation to UTF-8 fails. For the time being, we don't care about 1468 # anything other than "BINARY". The only other value that is currently 1469 # allowed by the standard is "ISO-IR 10646 2000 UTF-8" in other words UTF-8. 1470 # Note that we only follow the initial 'hdrcharset' setting to preserve 1471 # the initial behavior of the 'tarfile' module. 1472 if raw_keyword == b"hdrcharset" and encoding is None: 1473 if raw_value == b"BINARY": 1474 encoding = tarfile.encoding 1475 else: # This branch ensures only the first 'hdrcharset' header is used. 1476 encoding = "utf-8" 1477 1478 pos += length 1479 1480 # If no explicit hdrcharset is set, we use UTF-8 as a default. 1481 if encoding is None: 1482 encoding = "utf-8" 1483 1484 # After parsing the raw headers we can decode them to text. 1485 for length, raw_keyword, raw_value in raw_headers: 1486 # Normally, we could just use "utf-8" as the encoding and "strict" 1487 # as the error handler, but we better not take the risk. 
For 1488 # example, GNU tar <= 1.23 is known to store filenames it cannot 1489 # translate to UTF-8 as raw strings (unfortunately without a 1490 # hdrcharset=BINARY header). 1491 # We first try the strict standard encoding, and if that fails we 1492 # fall back on the user's encoding and error handler. 1493 keyword = self._decode_pax_field(raw_keyword, "utf-8", "utf-8", 1494 tarfile.errors) 1495 if keyword in PAX_NAME_FIELDS: 1496 value = self._decode_pax_field(raw_value, encoding, tarfile.encoding, 1497 tarfile.errors) 1498 else: 1499 value = self._decode_pax_field(raw_value, "utf-8", "utf-8", 1500 tarfile.errors) 1501 1502 pax_headers[keyword] = value 1503 1504 # Fetch the next header. 1505 try: 1506 next = self.fromtarfile(tarfile) 1507 except HeaderError as e: 1508 raise SubsequentHeaderError(str(e)) from None 1509 1510 # Process GNU sparse information. 1511 if "GNU.sparse.map" in pax_headers: 1512 # GNU extended sparse format version 0.1. 1513 self._proc_gnusparse_01(next, pax_headers) 1514 1515 elif "GNU.sparse.size" in pax_headers: 1516 # GNU extended sparse format version 0.0. 1517 self._proc_gnusparse_00(next, raw_headers) 1518 1519 elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0": 1520 # GNU extended sparse format version 1.0. 1521 self._proc_gnusparse_10(next, pax_headers, tarfile) 1522 1523 if self.type in (XHDTYPE, SOLARIS_XHDTYPE): 1524 # Patch the TarInfo object with the extended header info. 1525 next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) 1526 next.offset = self.offset 1527 1528 if "size" in pax_headers: 1529 # If the extended header replaces the size field, 1530 # we need to recalculate the offset where the next 1531 # header starts. 
1532 offset = next.offset_data 1533 if next.isreg() or next.type not in SUPPORTED_TYPES: 1534 offset += next._block(next.size) 1535 tarfile.offset = offset 1536 1537 return next 1538 1539 def _proc_gnusparse_00(self, next, raw_headers): 1540 """Process a GNU tar extended sparse header, version 0.0. 1541 """ 1542 offsets = [] 1543 numbytes = [] 1544 for _, keyword, value in raw_headers: 1545 if keyword == b"GNU.sparse.offset": 1546 try: 1547 offsets.append(int(value.decode())) 1548 except ValueError: 1549 raise InvalidHeaderError("invalid header") 1550 1551 elif keyword == b"GNU.sparse.numbytes": 1552 try: 1553 numbytes.append(int(value.decode())) 1554 except ValueError: 1555 raise InvalidHeaderError("invalid header") 1556 1557 next.sparse = list(zip(offsets, numbytes)) 1558 1559 def _proc_gnusparse_01(self, next, pax_headers): 1560 """Process a GNU tar extended sparse header, version 0.1. 1561 """ 1562 sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")] 1563 next.sparse = list(zip(sparse[::2], sparse[1::2])) 1564 1565 def _proc_gnusparse_10(self, next, pax_headers, tarfile): 1566 """Process a GNU tar extended sparse header, version 1.0. 1567 """ 1568 fields = None 1569 sparse = [] 1570 buf = tarfile.fileobj.read(BLOCKSIZE) 1571 fields, buf = buf.split(b"\n", 1) 1572 fields = int(fields) 1573 while len(sparse) < fields * 2: 1574 if b"\n" not in buf: 1575 buf += tarfile.fileobj.read(BLOCKSIZE) 1576 number, buf = buf.split(b"\n", 1) 1577 sparse.append(int(number)) 1578 next.offset_data = tarfile.fileobj.tell() 1579 next.sparse = list(zip(sparse[::2], sparse[1::2])) 1580 1581 def _apply_pax_info(self, pax_headers, encoding, errors): 1582 """Replace fields with supplemental information from a previous 1583 pax extended or global header. 
1584 """ 1585 for keyword, value in pax_headers.items(): 1586 if keyword == "GNU.sparse.name": 1587 setattr(self, "path", value) 1588 elif keyword == "GNU.sparse.size": 1589 setattr(self, "size", int(value)) 1590 elif keyword == "GNU.sparse.realsize": 1591 setattr(self, "size", int(value)) 1592 elif keyword in PAX_FIELDS: 1593 if keyword in PAX_NUMBER_FIELDS: 1594 try: 1595 value = PAX_NUMBER_FIELDS[keyword](value) 1596 except ValueError: 1597 value = 0 1598 if keyword == "path": 1599 value = value.rstrip("/") 1600 setattr(self, keyword, value) 1601 1602 self.pax_headers = pax_headers.copy() 1603 1604 def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors): 1605 """Decode a single field from a pax record. 1606 """ 1607 try: 1608 return value.decode(encoding, "strict") 1609 except UnicodeDecodeError: 1610 return value.decode(fallback_encoding, fallback_errors) 1611 1612 def _block(self, count): 1613 """Round up a byte count by BLOCKSIZE and return it, 1614 e.g. _block(834) => 1024. 1615 """ 1616 # Only non-negative offsets are allowed 1617 if count < 0: 1618 raise InvalidHeaderError("invalid offset") 1619 blocks, remainder = divmod(count, BLOCKSIZE) 1620 if remainder: 1621 blocks += 1 1622 return blocks * BLOCKSIZE 1623 1624 def isreg(self): 1625 'Return True if the Tarinfo object is a regular file.' 1626 return self.type in REGULAR_TYPES 1627 1628 def isfile(self): 1629 'Return True if the Tarinfo object is a regular file.' 1630 return self.isreg() 1631 1632 def isdir(self): 1633 'Return True if it is a directory.' 1634 return self.type == DIRTYPE 1635 1636 def issym(self): 1637 'Return True if it is a symbolic link.' 1638 return self.type == SYMTYPE 1639 1640 def islnk(self): 1641 'Return True if it is a hard link.' 1642 return self.type == LNKTYPE 1643 1644 def ischr(self): 1645 'Return True if it is a character device.' 1646 return self.type == CHRTYPE 1647 1648 def isblk(self): 1649 'Return True if it is a block device.' 
1650 return self.type == BLKTYPE 1651 1652 def isfifo(self): 1653 'Return True if it is a FIFO.' 1654 return self.type == FIFOTYPE 1655 1656 def issparse(self): 1657 return self.sparse is not None 1658 1659 def isdev(self): 1660 'Return True if it is one of character device, block device or FIFO.' 1661 return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) 1662# class TarInfo 1663 1664class TarFile(object): 1665 """The TarFile Class provides an interface to tar archives. 1666 """ 1667 1668 debug = 0 # May be set from 0 (no msgs) to 3 (all msgs) 1669 1670 dereference = False # If true, add content of linked file to the 1671 # tar file, else the link. 1672 1673 ignore_zeros = False # If true, skips empty or invalid blocks and 1674 # continues processing. 1675 1676 errorlevel = 1 # If 0, fatal errors only appear in debug 1677 # messages (if debug >= 0). If > 0, errors 1678 # are passed to the caller as exceptions. 1679 1680 format = DEFAULT_FORMAT # The format to use when creating an archive. 1681 1682 encoding = ENCODING # Encoding for 8-bit character strings. 1683 1684 errors = None # Error handler for unicode conversion. 1685 1686 tarinfo = TarInfo # The default TarInfo class to use. 1687 1688 fileobject = ExFileObject # The file-object for extractfile(). 1689 1690 extraction_filter = None # The default filter for extraction. 1691 1692 def __init__(self, name=None, mode="r", fileobj=None, format=None, 1693 tarinfo=None, dereference=None, ignore_zeros=None, encoding=None, 1694 errors="surrogateescape", pax_headers=None, debug=None, 1695 errorlevel=None, copybufsize=None): 1696 """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to 1697 read from an existing archive, 'a' to append data to an existing 1698 file or 'w' to create a new file overwriting an existing one. `mode' 1699 defaults to 'r'. 1700 If `fileobj' is given, it is used for reading or writing data. If it 1701 can be determined, `mode' is overridden by `fileobj's mode. 
1702 `fileobj' is not closed, when TarFile is closed. 1703 """ 1704 modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"} 1705 if mode not in modes: 1706 raise ValueError("mode must be 'r', 'a', 'w' or 'x'") 1707 self.mode = mode 1708 self._mode = modes[mode] 1709 1710 if not fileobj: 1711 if self.mode == "a" and not os.path.exists(name): 1712 # Create nonexistent files in append mode. 1713 self.mode = "w" 1714 self._mode = "wb" 1715 fileobj = bltn_open(name, self._mode) 1716 self._extfileobj = False 1717 else: 1718 if (name is None and hasattr(fileobj, "name") and 1719 isinstance(fileobj.name, (str, bytes))): 1720 name = fileobj.name 1721 if hasattr(fileobj, "mode"): 1722 self._mode = fileobj.mode 1723 self._extfileobj = True 1724 self.name = os.path.abspath(name) if name else None 1725 self.fileobj = fileobj 1726 1727 # Init attributes. 1728 if format is not None: 1729 self.format = format 1730 if tarinfo is not None: 1731 self.tarinfo = tarinfo 1732 if dereference is not None: 1733 self.dereference = dereference 1734 if ignore_zeros is not None: 1735 self.ignore_zeros = ignore_zeros 1736 if encoding is not None: 1737 self.encoding = encoding 1738 self.errors = errors 1739 1740 if pax_headers is not None and self.format == PAX_FORMAT: 1741 self.pax_headers = pax_headers 1742 else: 1743 self.pax_headers = {} 1744 1745 if debug is not None: 1746 self.debug = debug 1747 if errorlevel is not None: 1748 self.errorlevel = errorlevel 1749 1750 # Init datastructures. 
1751 self.copybufsize = copybufsize 1752 self.closed = False 1753 self.members = [] # list of members as TarInfo objects 1754 self._loaded = False # flag if all members have been read 1755 self.offset = self.fileobj.tell() 1756 # current position in the archive file 1757 self.inodes = {} # dictionary caching the inodes of 1758 # archive members already added 1759 1760 try: 1761 if self.mode == "r": 1762 self.firstmember = None 1763 self.firstmember = self.next() 1764 1765 if self.mode == "a": 1766 # Move to the end of the archive, 1767 # before the first empty block. 1768 while True: 1769 self.fileobj.seek(self.offset) 1770 try: 1771 tarinfo = self.tarinfo.fromtarfile(self) 1772 self.members.append(tarinfo) 1773 except EOFHeaderError: 1774 self.fileobj.seek(self.offset) 1775 break 1776 except HeaderError as e: 1777 raise ReadError(str(e)) from None 1778 1779 if self.mode in ("a", "w", "x"): 1780 self._loaded = True 1781 1782 if self.pax_headers: 1783 buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy()) 1784 self.fileobj.write(buf) 1785 self.offset += len(buf) 1786 except: 1787 if not self._extfileobj: 1788 self.fileobj.close() 1789 self.closed = True 1790 raise 1791 1792 #-------------------------------------------------------------------------- 1793 # Below are the classmethods which act as alternate constructors to the 1794 # TarFile class. The open() method is the only one that is needed for 1795 # public use; it is the "super"-constructor and is able to select an 1796 # adequate "sub"-constructor for a particular compression using the mapping 1797 # from OPEN_METH. 1798 # 1799 # This concept allows one to subclass TarFile without losing the comfort of 1800 # the super-constructor. A sub-constructor is registered and made available 1801 # by adding it to the mapping in OPEN_METH. 1802 1803 @classmethod 1804 def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs): 1805 """Open a tar archive for reading, writing or appending. 
Return 1806 an appropriate TarFile class. 1807 1808 mode: 1809 'r' or 'r:*' open for reading with transparent compression 1810 'r:' open for reading exclusively uncompressed 1811 'r:gz' open for reading with gzip compression 1812 'r:bz2' open for reading with bzip2 compression 1813 'r:xz' open for reading with lzma compression 1814 'a' or 'a:' open for appending, creating the file if necessary 1815 'w' or 'w:' open for writing without compression 1816 'w:gz' open for writing with gzip compression 1817 'w:bz2' open for writing with bzip2 compression 1818 'w:xz' open for writing with lzma compression 1819 1820 'x' or 'x:' create a tarfile exclusively without compression, raise 1821 an exception if the file is already created 1822 'x:gz' create a gzip compressed tarfile, raise an exception 1823 if the file is already created 1824 'x:bz2' create a bzip2 compressed tarfile, raise an exception 1825 if the file is already created 1826 'x:xz' create an lzma compressed tarfile, raise an exception 1827 if the file is already created 1828 1829 'r|*' open a stream of tar blocks with transparent compression 1830 'r|' open an uncompressed stream of tar blocks for reading 1831 'r|gz' open a gzip compressed stream of tar blocks 1832 'r|bz2' open a bzip2 compressed stream of tar blocks 1833 'r|xz' open an lzma compressed stream of tar blocks 1834 'w|' open an uncompressed stream for writing 1835 'w|gz' open a gzip compressed stream for writing 1836 'w|bz2' open a bzip2 compressed stream for writing 1837 'w|xz' open an lzma compressed stream for writing 1838 """ 1839 1840 if not name and not fileobj: 1841 raise ValueError("nothing to open") 1842 1843 if mode in ("r", "r:*"): 1844 # Find out which *open() is appropriate for opening the file. 
1845 def not_compressed(comptype): 1846 return cls.OPEN_METH[comptype] == 'taropen' 1847 error_msgs = [] 1848 for comptype in sorted(cls.OPEN_METH, key=not_compressed): 1849 func = getattr(cls, cls.OPEN_METH[comptype]) 1850 if fileobj is not None: 1851 saved_pos = fileobj.tell() 1852 try: 1853 return func(name, "r", fileobj, **kwargs) 1854 except (ReadError, CompressionError) as e: 1855 error_msgs.append(f'- method {comptype}: {e!r}') 1856 if fileobj is not None: 1857 fileobj.seek(saved_pos) 1858 continue 1859 error_msgs_summary = '\n'.join(error_msgs) 1860 raise ReadError(f"file could not be opened successfully:\n{error_msgs_summary}") 1861 1862 elif ":" in mode: 1863 filemode, comptype = mode.split(":", 1) 1864 filemode = filemode or "r" 1865 comptype = comptype or "tar" 1866 1867 # Select the *open() function according to 1868 # given compression. 1869 if comptype in cls.OPEN_METH: 1870 func = getattr(cls, cls.OPEN_METH[comptype]) 1871 else: 1872 raise CompressionError("unknown compression type %r" % comptype) 1873 return func(name, filemode, fileobj, **kwargs) 1874 1875 elif "|" in mode: 1876 filemode, comptype = mode.split("|", 1) 1877 filemode = filemode or "r" 1878 comptype = comptype or "tar" 1879 1880 if filemode not in ("r", "w"): 1881 raise ValueError("mode must be 'r' or 'w'") 1882 1883 stream = _Stream(name, filemode, comptype, fileobj, bufsize) 1884 try: 1885 t = cls(name, filemode, stream, **kwargs) 1886 except: 1887 stream.close() 1888 raise 1889 t._extfileobj = False 1890 return t 1891 1892 elif mode in ("a", "w", "x"): 1893 return cls.taropen(name, mode, fileobj, **kwargs) 1894 1895 raise ValueError("undiscernible mode") 1896 1897 @classmethod 1898 def taropen(cls, name, mode="r", fileobj=None, **kwargs): 1899 """Open uncompressed tar archive name for reading or writing. 
1900 """ 1901 if mode not in ("r", "a", "w", "x"): 1902 raise ValueError("mode must be 'r', 'a', 'w' or 'x'") 1903 return cls(name, mode, fileobj, **kwargs) 1904 1905 @classmethod 1906 def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1907 """Open gzip compressed tar archive name for reading or writing. 1908 Appending is not allowed. 1909 """ 1910 if mode not in ("r", "w", "x"): 1911 raise ValueError("mode must be 'r', 'w' or 'x'") 1912 1913 try: 1914 from gzip import GzipFile 1915 except ImportError: 1916 raise CompressionError("gzip module is not available") from None 1917 1918 try: 1919 fileobj = GzipFile(name, mode + "b", compresslevel, fileobj) 1920 except OSError as e: 1921 if fileobj is not None and mode == 'r': 1922 raise ReadError("not a gzip file") from e 1923 raise 1924 1925 try: 1926 t = cls.taropen(name, mode, fileobj, **kwargs) 1927 except OSError as e: 1928 fileobj.close() 1929 if mode == 'r': 1930 raise ReadError("not a gzip file") from e 1931 raise 1932 except: 1933 fileobj.close() 1934 raise 1935 t._extfileobj = False 1936 return t 1937 1938 @classmethod 1939 def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs): 1940 """Open bzip2 compressed tar archive name for reading or writing. 1941 Appending is not allowed. 
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from bz2 import BZ2File
        except ImportError:
            raise CompressionError("bz2 module is not available") from None

        fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError) as e:
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a bzip2 file") from e
            raise
        except:
            fileobj.close()
            raise
        # The BZ2File wrapper was created here, so close() must close it.
        t._extfileobj = False
        return t

    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from lzma import LZMAFile, LZMAError
        except ImportError:
            raise CompressionError("lzma module is not available") from None

        fileobj = LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (LZMAError, EOFError) as e:
            fileobj.close()
            if mode == 'r':
                raise ReadError("not an lzma file") from e
            raise
        except:
            fileobj.close()
            raise
        # The LZMAFile wrapper was created here, so close() must close it.
        t._extfileobj = False
        return t

    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }

    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode in ("a", "w", "x"):
                self.fileobj.write(NUL * (BLOCKSIZE * 2))
                self.offset += (BLOCKSIZE * 2)
                # fill up the end with zero-blocks
                # (like option -b20 for tar does)
                blocks, remainder = divmod(self.offset, RECORDSIZE)
                if remainder > 0:
                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
        finally:
            # Only close the underlying file object if we created it ourselves.
            if not self._extfileobj:
                self.fileobj.close()

    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
        """
        tarinfo = self._getmember(name.rstrip('/'))
        if tarinfo is None:
            raise KeyError("filename %r not found" % name)
        return tarinfo

    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
           list has the same order as the members in the archive.
        """
        self._check()
        if not self._loaded:    # if we want to obtain a list of
            self._load()        # all members, we first have to
                                # scan the whole archive.
        return self.members

    def getnames(self):
        """Return the members of the archive as a list of their names. It has
           the same order as the list returned by getmembers().
        """
        return [tarinfo.name for tarinfo in self.getmembers()]

    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
           on an existing file. The file is either named by `name', or
           specified as a file object `fileobj' with a file descriptor. If
           given, `arcname' specifies an alternative name for the file in the
           archive, otherwise, the name is taken from the 'name' attribute of
           'fileobj', or the 'name' argument. The name should be a text
           string.
        """
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self  # Not needed

        # Use os.stat or os.lstat, depending on if symlinks shall be resolved.
        if fileobj is None:
            if not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket): not representable in tar.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo

    def list(self, verbose=True, *, members=None):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        self._check()

        if members is None:
            members = self
        for tarinfo in members:
            if verbose:
                # mode/mtime may be None for headers stripped by a filter.
                if tarinfo.mode is None:
                    _safe_print("??????????")
                else:
                    _safe_print(stat.filemode(tarinfo.mode))
                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                       tarinfo.gname or tarinfo.gid))
                if tarinfo.ischr() or tarinfo.isblk():
                    _safe_print("%10s" %
                            ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
                else:
                    _safe_print("%10d" % tarinfo.size)
                if tarinfo.mtime is None:
                    _safe_print("????-??-?? ??:??:??")
                else:
                    _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
                                % time.localtime(tarinfo.mtime)[:6])

            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

            if verbose:
                if tarinfo.issym():
                    _safe_print("-> " + tarinfo.linkname)
                if tarinfo.islnk():
                    _safe_print("link to " + tarinfo.linkname)
            print()

    def add(self, name, arcname=None, recursive=True, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
           setting `recursive' to False. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
        """
        self._check("awx")

        if arcname is None:
            arcname = name

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                # Sorted for reproducible archive contents.
                for f in sorted(os.listdir(name)):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                            recursive, filter=filter)

        else:
            self.addfile(tarinfo)

    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, it should be a binary file, and tarinfo.size bytes are read
           from it and added to the archive. You can create TarInfo objects
           directly, or by using gettarinfo().
        """
        self._check("awx")

        # Store a private copy so later mutation of the caller's object
        # does not affect the member list.
        tarinfo = copy.copy(tarinfo)

        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
        self.fileobj.write(buf)
        self.offset += len(buf)
        bufsize = self.copybufsize
        # If there's data to follow, append it.
        if fileobj is not None:
            copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
            if remainder > 0:
                # Pad the last partial block up to a full BLOCKSIZE.
                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                blocks += 1
            self.offset += blocks * BLOCKSIZE

        self.members.append(tarinfo)

    def _get_filter_function(self, filter):
        """Resolve the `filter` argument (None, a callable, or a filter
           name string) to the filter function to apply on extraction.
        """
        if filter is None:
            filter = self.extraction_filter
            if filter is None:
                # No per-call or per-instance filter configured.
                return fully_trusted_filter
            if isinstance(filter, str):
                raise TypeError(
                    'String names are not supported for '
                    + 'TarFile.extraction_filter. Use a function such as '
                    + 'tarfile.data_filter directly.')
            return filter
        if callable(filter):
            return filter
        try:
            return _NAMED_FILTERS[filter]
        except KeyError:
            raise ValueError(f"filter {filter!r} not found") from None

    def extractall(self, path=".", members=None, *, numeric_owner=False,
                   filter=None):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers(). If `numeric_owner` is True, only
           the numbers for user/group names are used and not the names.

           The `filter` function will be called on each member just
           before extraction.
           It can return a changed TarInfo or None to skip the member.
           String names of common filters are accepted.
        """
        directories = []

        filter_function = self._get_filter_function(filter)
        if members is None:
            members = self

        for member in members:
            tarinfo, unfiltered = self._get_extract_tarinfo(
                member, filter_function, path)
            if tarinfo is None:
                continue
            if tarinfo.isdir():
                # For directories, delay setting attributes until later,
                # since permissions can interfere with extraction and
                # extracting contents can reset mtime.
                directories.append(unfiltered)
            self._extract_one(tarinfo, path, set_attrs=not tarinfo.isdir(),
                              numeric_owner=numeric_owner,
                              filter_function=filter_function)

        # Reverse sort directories.
        directories.sort(key=lambda a: a.name, reverse=True)

        # Set correct owner, mtime and filemode on directories.
        for unfiltered in directories:
            try:
                # Need to re-apply any filter, to take the *current* filesystem
                # state into account.
                try:
                    tarinfo = filter_function(unfiltered, path)
                except _FILTER_ERRORS as exc:
                    self._log_no_directory_fixup(unfiltered, repr(exc))
                    continue
                if tarinfo is None:
                    self._log_no_directory_fixup(unfiltered,
                                                 'excluded by filter')
                    continue
                dirpath = os.path.join(path, tarinfo.name)
                try:
                    lstat = os.lstat(dirpath)
                except FileNotFoundError:
                    self._log_no_directory_fixup(tarinfo, 'missing')
                    continue
                if not stat.S_ISDIR(lstat.st_mode):
                    # This is no longer a directory; presumably a later
                    # member overwrote the entry.
                    self._log_no_directory_fixup(tarinfo, 'not a directory')
                    continue
                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                self._handle_nonfatal_error(e)

    def _log_no_directory_fixup(self, member, reason):
        """Debug-log that a directory's attributes were not fixed up."""
        self._dbg(2, "tarfile: Not fixing up directory %r (%s)" %
                  (member.name, reason))

    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False,
                filter=None):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
           is True, only the numbers for user/group names are used and not
           the names.

           The `filter` function will be called before extraction.
           It can return a changed TarInfo or None to skip the member.
           String names of common filters are accepted.
        """
        filter_function = self._get_filter_function(filter)
        tarinfo, unfiltered = self._get_extract_tarinfo(
            member, filter_function, path)
        if tarinfo is not None:
            self._extract_one(tarinfo, path, set_attrs, numeric_owner)

    def _get_extract_tarinfo(self, member, filter_function, path):
        """Get (filtered, unfiltered) TarInfos from *member*

        *member* might be a string.

        Return (None, None) if not found.
        """

        if isinstance(member, str):
            unfiltered = self.getmember(member)
        else:
            unfiltered = member

        filtered = None
        try:
            filtered = filter_function(unfiltered, path)
        except (OSError, FilterError) as e:
            self._handle_fatal_error(e)
        except ExtractError as e:
            self._handle_nonfatal_error(e)
        if filtered is None:
            self._dbg(2, "tarfile: Excluded %r" % unfiltered.name)
            return None, None

        # Prepare the link target for makelink().
        if filtered.islnk():
            filtered = copy.copy(filtered)
            filtered._link_target = os.path.join(path, filtered.linkname)
        return filtered, unfiltered

    def _extract_one(self, tarinfo, path, set_attrs, numeric_owner,
                     filter_function=None):
        """Extract from filtered tarinfo to disk.

           filter_function is only used when extracting a *different*
           member (e.g. as fallback to creating a symlink)
        """
        self._check("r")

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs,
                                 numeric_owner=numeric_owner,
                                 filter_function=filter_function,
                                 extraction_root=path)
        except OSError as e:
            self._handle_fatal_error(e)
        except ExtractError as e:
            self._handle_nonfatal_error(e)

    def _handle_nonfatal_error(self, e):
        """Handle non-fatal error (ExtractError) according to errorlevel"""
        if self.errorlevel > 1:
            raise
        else:
            self._dbg(1, "tarfile: %s" % e)

    def _handle_fatal_error(self, e):
        """Handle "fatal" error according to self.errorlevel"""
        if self.errorlevel > 0:
            raise
        elif isinstance(e, OSError):
            if e.filename is None:
                self._dbg(1, "tarfile: %s" % e.strerror)
            else:
                self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        else:
            self._dbg(1, "tarfile: %s %s" % (type(e).__name__, e))

    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file or
           a link, an io.BufferedReader object is returned. For all other
           existing members, None is returned. If `member' does not appear
           in the archive, KeyError is raised.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
            # Members with unknown types are treated as regular files.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None

    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                        numeric_owner=False, *, filter_function=None,
                        extraction_root=None):
        """Extract the filtered TarInfo object tarinfo to a physical
           file called targetpath.

           filter_function is only used when extracting a *different*
           member (e.g. as fallback to creating a symlink)
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink_with_filter(
                tarinfo, targetpath,
                filter_function=filter_function,
                extraction_root=extraction_root)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath, numeric_owner)
            if not tarinfo.issym():
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)

    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            if tarinfo.mode is None:
                # Use the system's default mode
                os.mkdir(targetpath)
            else:
                # Use a safe mode for the directory, the real mode is set
                # later in _extract_member().
                os.mkdir(targetpath, 0o700)
        except FileExistsError:
            pass

    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        bufsize = self.copybufsize
        with bltn_open(targetpath, "wb") as target:
            if tarinfo.sparse is not None:
                # Write only the data segments; holes are left sparse and
                # the final size is fixed with truncate().
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size, ReadError, bufsize)
                target.seek(tarinfo.size)
                target.truncate()
            else:
                copyfileobj(source, target, tarinfo.size, ReadError, bufsize)

    def makeunknown(self, tarinfo, targetpath):
        """Make a file from a TarInfo object with an unknown type
           at targetpath.
        """
        self.makefile(tarinfo, targetpath)
        self._dbg(1, "tarfile: Unknown file type %r, " \
                     "extracted as regular file." % tarinfo.type)

    def makefifo(self, tarinfo, targetpath):
        """Make a fifo called targetpath.
        """
        if hasattr(os, "mkfifo"):
            os.mkfifo(targetpath)
        else:
            raise ExtractError("fifo not supported by system")

    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.
        """
        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
            raise ExtractError("special devices not supported by system")

        mode = tarinfo.mode
        if mode is None:
            # Use mknod's default
            mode = 0o600
        if tarinfo.isblk():
            mode |= stat.S_IFBLK
        else:
            mode |= stat.S_IFCHR

        os.mknod(targetpath, mode,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))

    def makelink(self, tarinfo, targetpath):
        return self.makelink_with_filter(tarinfo, targetpath, None, None)

    def makelink_with_filter(self, tarinfo, targetpath,
                             filter_function, extraction_root):
        """Make a (symbolic) link called targetpath. If it cannot be created
          (platform limitation), we try to make a copy of the referenced file
          instead of a link.

          filter_function is only used when extracting a *different*
          member (e.g. as fallback to creating a link).
        """
        # Remember whether a failed KeyError lookup should be reported as an
        # ExtractError (i.e. the platform refused to create the link).
        keyerror_to_extracterror = False
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                if os.path.lexists(targetpath):
                    # Avoid FileExistsError on following os.symlink.
                    os.unlink(targetpath)
                os.symlink(tarinfo.linkname, targetpath)
                return
            else:
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                    return
        except symlink_exception:
            keyerror_to_extracterror = True

        # Fall back to extracting the linked-to member's contents instead.
        try:
            unfiltered = self._find_link_target(tarinfo)
        except KeyError:
            if keyerror_to_extracterror:
                raise ExtractError(
                    "unable to resolve link inside archive") from None
            else:
                raise

        if filter_function is None:
            filtered = unfiltered
        else:
            if extraction_root is None:
                raise ExtractError(
                    "makelink_with_filter: if filter_function is not None, "
                    + "extraction_root must also not be None")
            try:
                filtered = filter_function(unfiltered, extraction_root)
            except _FILTER_ERRORS as cause:
                raise LinkFallbackError(tarinfo, unfiltered.name) from cause
        if filtered is not None:
            self._extract_member(filtered, targetpath,
                                 filter_function=filter_function,
                                 extraction_root=extraction_root)

    def chown(self, tarinfo, targetpath, numeric_owner):
        """Set owner of targetpath according to tarinfo. If numeric_owner
           is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
           is False, fall back to .gid/.uid when the search based on name
           fails.
        """
        if hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            g = tarinfo.gid
            u = tarinfo.uid
            if not numeric_owner:
                try:
                    if grp and tarinfo.gname:
                        g = grp.getgrnam(tarinfo.gname)[2]
                except KeyError:
                    pass
                try:
                    if pwd and tarinfo.uname:
                        u = pwd.getpwnam(tarinfo.uname)[2]
                except KeyError:
                    pass
            # -1 tells os.chown to leave that id unchanged.
            if g is None:
                g = -1
            if u is None:
                u = -1
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    os.lchown(targetpath, u, g)
                else:
                    os.chown(targetpath, u, g)
            except OSError as e:
                raise ExtractError("could not change owner") from e

    def chmod(self, tarinfo, targetpath):
        """Set file permissions of targetpath according to tarinfo.
        """
        if tarinfo.mode is None:
            return
        try:
            os.chmod(targetpath, tarinfo.mode)
        except OSError as e:
            raise ExtractError("could not change mode") from e

    def utime(self, tarinfo, targetpath):
        """Set modification time of targetpath according to tarinfo.
        """
        mtime = tarinfo.mtime
        if mtime is None:
            return
        if not hasattr(os, 'utime'):
            return
        try:
            os.utime(targetpath, (mtime, mtime))
        except OSError as e:
            raise ExtractError("could not change modification time") from e

    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            if self.offset == 0:
                return None
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    raise ReadError(str(e)) from None
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file") from None
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e)) from None
            except SubsequentHeaderError as e:
                raise ReadError(str(e)) from None
            except Exception as e:
                # Map a corrupt compressed stream (zlib.error) to ReadError;
                # zlib may be unavailable, hence the ImportError guard.
                try:
                    import zlib
                    if isinstance(e, zlib.error):
                        raise ReadError(f'zlib error: {e}') from None
                    else:
                        raise e
                except ImportError:
                    raise e
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            self._loaded = True

        return tarinfo

    #--------------------------------------------------------------------------
    # Little helper methods:

    def _getmember(self, name, tarinfo=None, normalize=False):
        """Find an archive member by name from bottom to top.
           If tarinfo is given, it is used as the starting point.
        """
        # Ensure that all members have been loaded.
        members = self.getmembers()

        # Limit the member search list up to tarinfo.
        skipping = False
        if tarinfo is not None:
            try:
                index = members.index(tarinfo)
            except ValueError:
                # The given starting point might be a (modified) copy.
                # We'll later skip members until we find an equivalent.
                skipping = True
            else:
                # Happy fast path
                members = members[:index]

        if normalize:
            name = os.path.normpath(name)

        for member in reversed(members):
            if skipping:
                if tarinfo.offset == member.offset:
                    skipping = False
                continue
            if normalize:
                member_name = os.path.normpath(member.name)
            else:
                member_name = member.name

            if name == member_name:
                return member

        if skipping:
            # Starting point was not found
            raise ValueError(tarinfo)

    def _load(self):
        """Read through the entire archive file and look for readable
           members.
        """
        while True:
            tarinfo = self.next()
            if tarinfo is None:
                break
        self._loaded = True

    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
           corresponds to TarFile's mode.
        """
        if self.closed:
            raise OSError("%s is closed" % self.__class__.__name__)
        if mode is not None and self.mode not in mode:
            raise OSError("bad operation for mode %r" % self.mode)

    def _find_link_target(self, tarinfo):
        """Find the target member of a symlink or hardlink member in the
           archive.
        """
        if tarinfo.issym():
            # Always search the entire archive.
            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
            limit = None
        else:
            # Search the archive before the link, because a hard link is
            # just a reference to an already archived file.
            linkname = tarinfo.linkname
            limit = tarinfo

        member = self._getmember(linkname, tarinfo=limit, normalize=True)
        if member is None:
            raise KeyError("linkname %r not found" % linkname)
        return member

    def __iter__(self):
        """Provide an iterator object.
        """
        if self._loaded:
            yield from self.members
            return

        # Yield items using TarFile's next() method.
        # When all members have been read, set TarFile as _loaded.
        index = 0
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will have already exhausted the next() method.
        if self.firstmember is not None:
            tarinfo = self.next()
            index += 1
            yield tarinfo

        while True:
            if index < len(self.members):
                tarinfo = self.members[index]
            elif not self._loaded:
                tarinfo = self.next()
                if not tarinfo:
                    self._loaded = True
                    return
            else:
                return
            index += 1
            yield tarinfo

    def _dbg(self, level, msg):
        """Write debugging output to sys.stderr.
        """
        if level <= self.debug:
            print(msg, file=sys.stderr)

    def __enter__(self):
        self._check()
        return self

    def __exit__(self, type, value, traceback):
        if type is None:
            self.close()
        else:
            # An exception occurred. We must not call close() because
            # it would try to write end-of-archive blocks and padding.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True

#--------------------
# exported functions
#--------------------

def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       'name' should be a string, file, or file-like object.
    """
    try:
        if hasattr(name, "read"):
            # Probing consumes data; restore the stream position afterwards.
            pos = name.tell()
            t = open(fileobj=name)
            name.seek(pos)
        else:
            t = open(name)
        t.close()
        return True
    except TarError:
        return False

open = TarFile.open


def main():
    """Entry point for the module's command-line interface."""
    import argparse

    description = 'A simple command-line interface for tarfile module.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    parser.add_argument('--filter', metavar='<filtername>',
                        choices=_NAMED_FILTERS,
                        help='Filter for extraction')

    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-l', '--list', metavar='<tarfile>',
                       help='Show listing of a tarfile')
    group.add_argument('-e', '--extract', nargs='+',
                       metavar=('<tarfile>', '<output_dir>'),
                       help='Extract tarfile into target dir')
    group.add_argument('-c', '--create', nargs='+',
                       metavar=('<name>', '<file>'),
                       help='Create tarfile from sources')
    group.add_argument('-t', '--test', metavar='<tarfile>',
                       help='Test if a tarfile is valid')

    args = parser.parse_args()

    if args.filter and args.extract is None:
        parser.exit(1, '--filter is only valid for extraction\n')

    if args.test is not None:
        src = args.test
        if is_tarfile(src):
            with open(src, 'r') as tar:
                tar.getmembers()
                print(tar.getmembers(), file=sys.stderr)
            if args.verbose:
                print('{!r} is a tar archive.'.format(src))
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.list is not None:
        src = args.list
        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.list(verbose=args.verbose)
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.extract is not None:
        if len(args.extract) == 1:
            src = args.extract[0]
            curdir = os.curdir
        elif len(args.extract) == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.extractall(path=curdir, filter=args.filter)
            if args.verbose:
                if curdir == '.':
                    msg = '{!r} file is extracted.'.format(src)
                else:
                    msg = ('{!r} file is extracted '
                           'into {!r} directory.').format(src, curdir)
                print(msg)
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.create is not None:
        tar_name = args.create.pop(0)
        _, ext = os.path.splitext(tar_name)
        # Choose the compression from the requested file extension.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
        }
        tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w'
        tar_files = args.create

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if args.verbose:
            print('{!r} file created.'.format(tar_name))

if __name__ == '__main__':
    main()