1#!/usr/bin/env python3 2#------------------------------------------------------------------- 3# tarfile.py 4#------------------------------------------------------------------- 5# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de> 6# All rights reserved. 7# 8# Permission is hereby granted, free of charge, to any person 9# obtaining a copy of this software and associated documentation 10# files (the "Software"), to deal in the Software without 11# restriction, including without limitation the rights to use, 12# copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the 14# Software is furnished to do so, subject to the following 15# conditions: 16# 17# The above copyright notice and this permission notice shall be 18# included in all copies or substantial portions of the Software. 19# 20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 27# OTHER DEALINGS IN THE SOFTWARE. 28# 29"""Read from and write to tar format archives. 30""" 31 32version = "0.9.0" 33__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)" 34__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend." 
35 36#--------- 37# Imports 38#--------- 39from builtins import open as bltn_open 40import sys 41import os 42import io 43import shutil 44import stat 45import time 46import struct 47import copy 48import re 49 50try: 51 import pwd 52except ImportError: 53 pwd = None 54try: 55 import grp 56except ImportError: 57 grp = None 58 59# os.symlink on Windows prior to 6.0 raises NotImplementedError 60symlink_exception = (AttributeError, NotImplementedError) 61try: 62 # OSError (winerror=1314) will be raised if the caller does not hold the 63 # SeCreateSymbolicLinkPrivilege privilege 64 symlink_exception += (OSError,) 65except NameError: 66 pass 67 68# from tarfile import * 69__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", 70 "CompressionError", "StreamError", "ExtractError", "HeaderError", 71 "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", 72 "DEFAULT_FORMAT", "open"] 73 74#--------------------------------------------------------- 75# tar constants 76#--------------------------------------------------------- 77NUL = b"\0" # the null character 78BLOCKSIZE = 512 # length of processing blocks 79RECORDSIZE = BLOCKSIZE * 20 # length of records 80GNU_MAGIC = b"ustar \0" # magic gnu tar string 81POSIX_MAGIC = b"ustar\x0000" # magic posix tar string 82 83LENGTH_NAME = 100 # maximum length of a filename 84LENGTH_LINK = 100 # maximum length of a linkname 85LENGTH_PREFIX = 155 # maximum length of the prefix field 86 87REGTYPE = b"0" # regular file 88AREGTYPE = b"\0" # regular file 89LNKTYPE = b"1" # link (inside tarfile) 90SYMTYPE = b"2" # symbolic link 91CHRTYPE = b"3" # character special device 92BLKTYPE = b"4" # block special device 93DIRTYPE = b"5" # directory 94FIFOTYPE = b"6" # fifo special device 95CONTTYPE = b"7" # contiguous file 96 97GNUTYPE_LONGNAME = b"L" # GNU tar longname 98GNUTYPE_LONGLINK = b"K" # GNU tar longlink 99GNUTYPE_SPARSE = b"S" # GNU tar sparse file 100 101XHDTYPE = b"x" # POSIX.1-2001 extended header 102XGLTYPE = b"g" # 
POSIX.1-2001 global header 103SOLARIS_XHDTYPE = b"X" # Solaris extended header 104 105USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format 106GNU_FORMAT = 1 # GNU tar format 107PAX_FORMAT = 2 # POSIX.1-2001 (pax) format 108DEFAULT_FORMAT = PAX_FORMAT 109 110#--------------------------------------------------------- 111# tarfile constants 112#--------------------------------------------------------- 113# File types that tarfile supports: 114SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, 115 SYMTYPE, DIRTYPE, FIFOTYPE, 116 CONTTYPE, CHRTYPE, BLKTYPE, 117 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 118 GNUTYPE_SPARSE) 119 120# File types that will be treated as a regular file. 121REGULAR_TYPES = (REGTYPE, AREGTYPE, 122 CONTTYPE, GNUTYPE_SPARSE) 123 124# File types that are part of the GNU tar format. 125GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 126 GNUTYPE_SPARSE) 127 128# Fields from a pax header that override a TarInfo attribute. 129PAX_FIELDS = ("path", "linkpath", "size", "mtime", 130 "uid", "gid", "uname", "gname") 131 132# Fields from a pax header that are affected by hdrcharset. 133PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"} 134 135# Fields in a pax header that are numbers, all other fields 136# are treated as strings. 137PAX_NUMBER_FIELDS = { 138 "atime": float, 139 "ctime": float, 140 "mtime": float, 141 "uid": int, 142 "gid": int, 143 "size": int 144} 145 146#--------------------------------------------------------- 147# initialization 148#--------------------------------------------------------- 149if os.name == "nt": 150 ENCODING = "utf-8" 151else: 152 ENCODING = sys.getfilesystemencoding() 153 154#--------------------------------------------------------- 155# Some useful functions 156#--------------------------------------------------------- 157 158def stn(s, length, encoding, errors): 159 """Convert a string to a null-terminated bytes object. 
160 """ 161 s = s.encode(encoding, errors) 162 return s[:length] + (length - len(s)) * NUL 163 164def nts(s, encoding, errors): 165 """Convert a null-terminated bytes object to a string. 166 """ 167 p = s.find(b"\0") 168 if p != -1: 169 s = s[:p] 170 return s.decode(encoding, errors) 171 172def nti(s): 173 """Convert a number field to a python number. 174 """ 175 # There are two possible encodings for a number field, see 176 # itn() below. 177 if s[0] in (0o200, 0o377): 178 n = 0 179 for i in range(len(s) - 1): 180 n <<= 8 181 n += s[i + 1] 182 if s[0] == 0o377: 183 n = -(256 ** (len(s) - 1) - n) 184 else: 185 try: 186 s = nts(s, "ascii", "strict") 187 n = int(s.strip() or "0", 8) 188 except ValueError: 189 raise InvalidHeaderError("invalid header") 190 return n 191 192def itn(n, digits=8, format=DEFAULT_FORMAT): 193 """Convert a python number to a number field. 194 """ 195 # POSIX 1003.1-1988 requires numbers to be encoded as a string of 196 # octal digits followed by a null-byte, this allows values up to 197 # (8**(digits-1))-1. GNU tar allows storing numbers greater than 198 # that if necessary. A leading 0o200 or 0o377 byte indicate this 199 # particular encoding, the following digits-1 bytes are a big-endian 200 # base-256 representation. This allows values up to (256**(digits-1))-1. 201 # A 0o200 byte indicates a positive number, a 0o377 byte a negative 202 # number. 
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # The chksum field occupies bytes 148..155 and is skipped ("8x");
    # 8 spaces (8 * 32 == 256) are added back in its place.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum

def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises *exception* if src runs out of data before *length*
       bytes were copied.
    """
    bufsize = bufsize or 16 * 1024
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, bufsize)
        return

    blocks, remainder = divmod(length, bufsize)
    for b in range(blocks):
        buf = src.read(bufsize)
        if len(buf) < bufsize:
            raise exception("unexpected end of data")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise exception("unexpected end of data")
        dst.write(buf)
    return

def _safe_print(s):
    # Print s, replacing characters the current stdout encoding cannot
    # represent, without a trailing newline.
    encoding = getattr(sys.stdout, 'encoding', None)
    if encoding is not None:
        s = s.encode(encoding, 'backslashreplace').decode(encoding)
    print(s, end=' ')


class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass

#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        os.close(self.fd)

    def read(self, size):
        return os.read(self.fd, size)

    def write(self, s):
        # os.write may perform a partial write (e.g. on pipes or when
        # interrupted); retry until everything has been written.
        view = memoryview(s)
        while view:
            written = os.write(self.fd, view)
            view = view[written:]
308 """ 309 310 def __init__(self, name, mode): 311 mode = { 312 "r": os.O_RDONLY, 313 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 314 }[mode] 315 if hasattr(os, "O_BINARY"): 316 mode |= os.O_BINARY 317 self.fd = os.open(name, mode, 0o666) 318 319 def close(self): 320 os.close(self.fd) 321 322 def read(self, size): 323 return os.read(self.fd, size) 324 325 def write(self, s): 326 os.write(self.fd, s) 327 328class _Stream: 329 """Class that serves as an adapter between TarFile and 330 a stream-like object. The stream-like object only 331 needs to have a read() or write() method and is accessed 332 blockwise. Use of gzip or bzip2 compression is possible. 333 A stream-like object could be for example: sys.stdin, 334 sys.stdout, a socket, a tape device etc. 335 336 _Stream is intended to be used only internally. 337 """ 338 339 def __init__(self, name, mode, comptype, fileobj, bufsize): 340 """Construct a _Stream object. 341 """ 342 self._extfileobj = True 343 if fileobj is None: 344 fileobj = _LowLevelFile(name, mode) 345 self._extfileobj = False 346 347 if comptype == '*': 348 # Enable transparent compression detection for the 349 # stream interface 350 fileobj = _StreamProxy(fileobj) 351 comptype = fileobj.getcomptype() 352 353 self.name = name or "" 354 self.mode = mode 355 self.comptype = comptype 356 self.fileobj = fileobj 357 self.bufsize = bufsize 358 self.buf = b"" 359 self.pos = 0 360 self.closed = False 361 362 try: 363 if comptype == "gz": 364 try: 365 import zlib 366 except ImportError: 367 raise CompressionError("zlib module is not available") from None 368 self.zlib = zlib 369 self.crc = zlib.crc32(b"") 370 if mode == "r": 371 self._init_read_gz() 372 self.exception = zlib.error 373 else: 374 self._init_write_gz() 375 376 elif comptype == "bz2": 377 try: 378 import bz2 379 except ImportError: 380 raise CompressionError("bz2 module is not available") from None 381 if mode == "r": 382 self.dbuf = b"" 383 self.cmp = bz2.BZ2Decompressor() 384 self.exception 
= OSError 385 else: 386 self.cmp = bz2.BZ2Compressor() 387 388 elif comptype == "xz": 389 try: 390 import lzma 391 except ImportError: 392 raise CompressionError("lzma module is not available") from None 393 if mode == "r": 394 self.dbuf = b"" 395 self.cmp = lzma.LZMADecompressor() 396 self.exception = lzma.LZMAError 397 else: 398 self.cmp = lzma.LZMACompressor() 399 400 elif comptype != "tar": 401 raise CompressionError("unknown compression type %r" % comptype) 402 403 except: 404 if not self._extfileobj: 405 self.fileobj.close() 406 self.closed = True 407 raise 408 409 def __del__(self): 410 if hasattr(self, "closed") and not self.closed: 411 self.close() 412 413 def _init_write_gz(self): 414 """Initialize for writing with gzip compression. 415 """ 416 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED, 417 -self.zlib.MAX_WBITS, 418 self.zlib.DEF_MEM_LEVEL, 419 0) 420 timestamp = struct.pack("<L", int(time.time())) 421 self.__write(b"\037\213\010\010" + timestamp + b"\002\377") 422 if self.name.endswith(".gz"): 423 self.name = self.name[:-3] 424 # Honor "directory components removed" from RFC1952 425 self.name = os.path.basename(self.name) 426 # RFC1952 says we must use ISO-8859-1 for the FNAME field. 427 self.__write(self.name.encode("iso-8859-1", "replace") + NUL) 428 429 def write(self, s): 430 """Write string s to the stream. 431 """ 432 if self.comptype == "gz": 433 self.crc = self.zlib.crc32(s, self.crc) 434 self.pos += len(s) 435 if self.comptype != "tar": 436 s = self.cmp.compress(s) 437 self.__write(s) 438 439 def __write(self, s): 440 """Write string s to the stream if a whole new block 441 is ready to be written. 442 """ 443 self.buf += s 444 while len(self.buf) > self.bufsize: 445 self.fileobj.write(self.buf[:self.bufsize]) 446 self.buf = self.buf[self.bufsize:] 447 448 def close(self): 449 """Close the _Stream object. No operation should be 450 done on it afterwards. 
451 """ 452 if self.closed: 453 return 454 455 self.closed = True 456 try: 457 if self.mode == "w" and self.comptype != "tar": 458 self.buf += self.cmp.flush() 459 460 if self.mode == "w" and self.buf: 461 self.fileobj.write(self.buf) 462 self.buf = b"" 463 if self.comptype == "gz": 464 self.fileobj.write(struct.pack("<L", self.crc)) 465 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF)) 466 finally: 467 if not self._extfileobj: 468 self.fileobj.close() 469 470 def _init_read_gz(self): 471 """Initialize for reading a gzip compressed fileobj. 472 """ 473 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) 474 self.dbuf = b"" 475 476 # taken from gzip.GzipFile with some alterations 477 if self.__read(2) != b"\037\213": 478 raise ReadError("not a gzip file") 479 if self.__read(1) != b"\010": 480 raise CompressionError("unsupported compression method") 481 482 flag = ord(self.__read(1)) 483 self.__read(6) 484 485 if flag & 4: 486 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1)) 487 self.read(xlen) 488 if flag & 8: 489 while True: 490 s = self.__read(1) 491 if not s or s == NUL: 492 break 493 if flag & 16: 494 while True: 495 s = self.__read(1) 496 if not s or s == NUL: 497 break 498 if flag & 2: 499 self.__read(2) 500 501 def tell(self): 502 """Return the stream's file pointer position. 503 """ 504 return self.pos 505 506 def seek(self, pos=0): 507 """Set the stream's file pointer to pos. Negative seeking 508 is forbidden. 509 """ 510 if pos - self.pos >= 0: 511 blocks, remainder = divmod(pos - self.pos, self.bufsize) 512 for i in range(blocks): 513 self.read(self.bufsize) 514 self.read(remainder) 515 else: 516 raise StreamError("seeking backwards is not allowed") 517 return self.pos 518 519 def read(self, size): 520 """Return the next size number of bytes from the stream.""" 521 assert size is not None 522 buf = self._read(size) 523 self.pos += len(buf) 524 return buf 525 526 def _read(self, size): 527 """Return size bytes from the stream. 
528 """ 529 if self.comptype == "tar": 530 return self.__read(size) 531 532 c = len(self.dbuf) 533 t = [self.dbuf] 534 while c < size: 535 # Skip underlying buffer to avoid unaligned double buffering. 536 if self.buf: 537 buf = self.buf 538 self.buf = b"" 539 else: 540 buf = self.fileobj.read(self.bufsize) 541 if not buf: 542 break 543 try: 544 buf = self.cmp.decompress(buf) 545 except self.exception as e: 546 raise ReadError("invalid compressed data") from e 547 t.append(buf) 548 c += len(buf) 549 t = b"".join(t) 550 self.dbuf = t[size:] 551 return t[:size] 552 553 def __read(self, size): 554 """Return size bytes from stream. If internal buffer is empty, 555 read another block from the stream. 556 """ 557 c = len(self.buf) 558 t = [self.buf] 559 while c < size: 560 buf = self.fileobj.read(self.bufsize) 561 if not buf: 562 break 563 t.append(buf) 564 c += len(buf) 565 t = b"".join(t) 566 self.buf = t[size:] 567 return t[:size] 568# class _Stream 569 570class _StreamProxy(object): 571 """Small proxy class that enables transparent compression 572 detection for the Stream interface (mode 'r|*'). 573 """ 574 575 def __init__(self, fileobj): 576 self.fileobj = fileobj 577 self.buf = self.fileobj.read(BLOCKSIZE) 578 579 def read(self, size): 580 self.read = self.fileobj.read 581 return self.buf 582 583 def getcomptype(self): 584 if self.buf.startswith(b"\x1f\x8b\x08"): 585 return "gz" 586 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY": 587 return "bz2" 588 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")): 589 return "xz" 590 else: 591 return "tar" 592 593 def close(self): 594 self.fileobj.close() 595# class StreamProxy 596 597#------------------------ 598# Extraction file object 599#------------------------ 600class _FileInFile(object): 601 """A thin wrapper around an existing file object that 602 provides a part of its data as an individual file 603 object. 
604 """ 605 606 def __init__(self, fileobj, offset, size, blockinfo=None): 607 self.fileobj = fileobj 608 self.offset = offset 609 self.size = size 610 self.position = 0 611 self.name = getattr(fileobj, "name", None) 612 self.closed = False 613 614 if blockinfo is None: 615 blockinfo = [(0, size)] 616 617 # Construct a map with data and zero blocks. 618 self.map_index = 0 619 self.map = [] 620 lastpos = 0 621 realpos = self.offset 622 for offset, size in blockinfo: 623 if offset > lastpos: 624 self.map.append((False, lastpos, offset, None)) 625 self.map.append((True, offset, offset + size, realpos)) 626 realpos += size 627 lastpos = offset + size 628 if lastpos < self.size: 629 self.map.append((False, lastpos, self.size, None)) 630 631 def flush(self): 632 pass 633 634 def readable(self): 635 return True 636 637 def writable(self): 638 return False 639 640 def seekable(self): 641 return self.fileobj.seekable() 642 643 def tell(self): 644 """Return the current file position. 645 """ 646 return self.position 647 648 def seek(self, position, whence=io.SEEK_SET): 649 """Seek to a position in the file. 650 """ 651 if whence == io.SEEK_SET: 652 self.position = min(max(position, 0), self.size) 653 elif whence == io.SEEK_CUR: 654 if position < 0: 655 self.position = max(self.position + position, 0) 656 else: 657 self.position = min(self.position + position, self.size) 658 elif whence == io.SEEK_END: 659 self.position = max(min(self.size + position, self.size), 0) 660 else: 661 raise ValueError("Invalid argument") 662 return self.position 663 664 def read(self, size=None): 665 """Read data from the file. 
666 """ 667 if size is None: 668 size = self.size - self.position 669 else: 670 size = min(size, self.size - self.position) 671 672 buf = b"" 673 while size > 0: 674 while True: 675 data, start, stop, offset = self.map[self.map_index] 676 if start <= self.position < stop: 677 break 678 else: 679 self.map_index += 1 680 if self.map_index == len(self.map): 681 self.map_index = 0 682 length = min(size, stop - self.position) 683 if data: 684 self.fileobj.seek(offset + (self.position - start)) 685 b = self.fileobj.read(length) 686 if len(b) != length: 687 raise ReadError("unexpected end of data") 688 buf += b 689 else: 690 buf += NUL * length 691 size -= length 692 self.position += length 693 return buf 694 695 def readinto(self, b): 696 buf = self.read(len(b)) 697 b[:len(buf)] = buf 698 return len(buf) 699 700 def close(self): 701 self.closed = True 702#class _FileInFile 703 704class ExFileObject(io.BufferedReader): 705 706 def __init__(self, tarfile, tarinfo): 707 fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data, 708 tarinfo.size, tarinfo.sparse) 709 super().__init__(fileobj) 710#class ExFileObject 711 712#------------------ 713# Exported Classes 714#------------------ 715class TarInfo(object): 716 """Informational class which holds the details about an 717 archive member given by a tar header block. 718 TarInfo objects are returned by TarFile.getmember(), 719 TarFile.getmembers() and TarFile.gettarinfo() and are 720 usually created internally. 721 """ 722 723 __slots__ = dict( 724 name = 'Name of the archive member.', 725 mode = 'Permission bits.', 726 uid = 'User ID of the user who originally stored this member.', 727 gid = 'Group ID of the user who originally stored this member.', 728 size = 'Size in bytes.', 729 mtime = 'Time of last modification.', 730 chksum = 'Header checksum.', 731 type = ('File type. 
type is usually one of these constants: ' 732 'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, ' 733 'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'), 734 linkname = ('Name of the target file name, which is only present ' 735 'in TarInfo objects of type LNKTYPE and SYMTYPE.'), 736 uname = 'User name.', 737 gname = 'Group name.', 738 devmajor = 'Device major number.', 739 devminor = 'Device minor number.', 740 offset = 'The tar header starts here.', 741 offset_data = "The file's data starts here.", 742 pax_headers = ('A dictionary containing key-value pairs of an ' 743 'associated pax extended header.'), 744 sparse = 'Sparse member information.', 745 tarfile = None, 746 _sparse_structs = None, 747 _link_target = None, 748 ) 749 750 def __init__(self, name=""): 751 """Construct a TarInfo object. name is the optional name 752 of the member. 753 """ 754 self.name = name # member name 755 self.mode = 0o644 # file permissions 756 self.uid = 0 # user id 757 self.gid = 0 # group id 758 self.size = 0 # file size 759 self.mtime = 0 # modification time 760 self.chksum = 0 # header checksum 761 self.type = REGTYPE # member type 762 self.linkname = "" # link name 763 self.uname = "" # user name 764 self.gname = "" # group name 765 self.devmajor = 0 # device major number 766 self.devminor = 0 # device minor number 767 768 self.offset = 0 # the tar header starts here 769 self.offset_data = 0 # the file's data starts here 770 771 self.sparse = None # sparse member information 772 self.pax_headers = {} # pax header information 773 774 @property 775 def path(self): 776 'In pax headers, "name" is called "path".' 777 return self.name 778 779 @path.setter 780 def path(self, name): 781 self.name = name 782 783 @property 784 def linkpath(self): 785 'In pax headers, "linkname" is called "linkpath".' 
786 return self.linkname 787 788 @linkpath.setter 789 def linkpath(self, linkname): 790 self.linkname = linkname 791 792 def __repr__(self): 793 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) 794 795 def get_info(self): 796 """Return the TarInfo's attributes as a dictionary. 797 """ 798 info = { 799 "name": self.name, 800 "mode": self.mode & 0o7777, 801 "uid": self.uid, 802 "gid": self.gid, 803 "size": self.size, 804 "mtime": self.mtime, 805 "chksum": self.chksum, 806 "type": self.type, 807 "linkname": self.linkname, 808 "uname": self.uname, 809 "gname": self.gname, 810 "devmajor": self.devmajor, 811 "devminor": self.devminor 812 } 813 814 if info["type"] == DIRTYPE and not info["name"].endswith("/"): 815 info["name"] += "/" 816 817 return info 818 819 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"): 820 """Return a tar header as a string of 512 byte blocks. 821 """ 822 info = self.get_info() 823 824 if format == USTAR_FORMAT: 825 return self.create_ustar_header(info, encoding, errors) 826 elif format == GNU_FORMAT: 827 return self.create_gnu_header(info, encoding, errors) 828 elif format == PAX_FORMAT: 829 return self.create_pax_header(info, encoding) 830 else: 831 raise ValueError("invalid format") 832 833 def create_ustar_header(self, info, encoding, errors): 834 """Return the object as a ustar header block. 835 """ 836 info["magic"] = POSIX_MAGIC 837 838 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 839 raise ValueError("linkname is too long") 840 841 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 842 info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors) 843 844 return self._create_header(info, USTAR_FORMAT, encoding, errors) 845 846 def create_gnu_header(self, info, encoding, errors): 847 """Return the object as a GNU header block sequence. 
848 """ 849 info["magic"] = GNU_MAGIC 850 851 buf = b"" 852 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 853 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors) 854 855 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 856 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors) 857 858 return buf + self._create_header(info, GNU_FORMAT, encoding, errors) 859 860 def create_pax_header(self, info, encoding): 861 """Return the object as a ustar header block. If it cannot be 862 represented this way, prepend a pax extended header sequence 863 with supplement information. 864 """ 865 info["magic"] = POSIX_MAGIC 866 pax_headers = self.pax_headers.copy() 867 868 # Test string fields for values that exceed the field length or cannot 869 # be represented in ASCII encoding. 870 for name, hname, length in ( 871 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), 872 ("uname", "uname", 32), ("gname", "gname", 32)): 873 874 if hname in pax_headers: 875 # The pax header has priority. 876 continue 877 878 # Try to encode the string as ASCII. 879 try: 880 info[name].encode("ascii", "strict") 881 except UnicodeEncodeError: 882 pax_headers[hname] = info[name] 883 continue 884 885 if len(info[name]) > length: 886 pax_headers[hname] = info[name] 887 888 # Test number fields for values that exceed the field limit or values 889 # that like to be stored as float. 890 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): 891 if name in pax_headers: 892 # The pax header has priority. Avoid overflow. 893 info[name] = 0 894 continue 895 896 val = info[name] 897 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): 898 pax_headers[name] = str(val) 899 info[name] = 0 900 901 # Create a pax extended header if necessary. 
902 if pax_headers: 903 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding) 904 else: 905 buf = b"" 906 907 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace") 908 909 @classmethod 910 def create_pax_global_header(cls, pax_headers): 911 """Return the object as a pax global header block sequence. 912 """ 913 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8") 914 915 def _posix_split_name(self, name, encoding, errors): 916 """Split a name longer than 100 chars into a prefix 917 and a name part. 918 """ 919 components = name.split("/") 920 for i in range(1, len(components)): 921 prefix = "/".join(components[:i]) 922 name = "/".join(components[i:]) 923 if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \ 924 len(name.encode(encoding, errors)) <= LENGTH_NAME: 925 break 926 else: 927 raise ValueError("name is too long") 928 929 return prefix, name 930 931 @staticmethod 932 def _create_header(info, format, encoding, errors): 933 """Return a header block. info is a dictionary with file 934 information, format must be one of the *_FORMAT constants. 
935 """ 936 has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE) 937 if has_device_fields: 938 devmajor = itn(info.get("devmajor", 0), 8, format) 939 devminor = itn(info.get("devminor", 0), 8, format) 940 else: 941 devmajor = stn("", 8, encoding, errors) 942 devminor = stn("", 8, encoding, errors) 943 944 parts = [ 945 stn(info.get("name", ""), 100, encoding, errors), 946 itn(info.get("mode", 0) & 0o7777, 8, format), 947 itn(info.get("uid", 0), 8, format), 948 itn(info.get("gid", 0), 8, format), 949 itn(info.get("size", 0), 12, format), 950 itn(info.get("mtime", 0), 12, format), 951 b" ", # checksum field 952 info.get("type", REGTYPE), 953 stn(info.get("linkname", ""), 100, encoding, errors), 954 info.get("magic", POSIX_MAGIC), 955 stn(info.get("uname", ""), 32, encoding, errors), 956 stn(info.get("gname", ""), 32, encoding, errors), 957 devmajor, 958 devminor, 959 stn(info.get("prefix", ""), 155, encoding, errors) 960 ] 961 962 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts)) 963 chksum = calc_chksums(buf[-BLOCKSIZE:])[0] 964 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:] 965 return buf 966 967 @staticmethod 968 def _create_payload(payload): 969 """Return the string payload filled with zero bytes 970 up to the next 512 byte border. 971 """ 972 blocks, remainder = divmod(len(payload), BLOCKSIZE) 973 if remainder > 0: 974 payload += (BLOCKSIZE - remainder) * NUL 975 return payload 976 977 @classmethod 978 def _create_gnu_long_header(cls, name, type, encoding, errors): 979 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence 980 for name. 981 """ 982 name = name.encode(encoding, errors) + NUL 983 984 info = {} 985 info["name"] = "././@LongLink" 986 info["type"] = type 987 info["size"] = len(name) 988 info["magic"] = GNU_MAGIC 989 990 # create extended header + name blocks. 
991 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \ 992 cls._create_payload(name) 993 994 @classmethod 995 def _create_pax_generic_header(cls, pax_headers, type, encoding): 996 """Return a POSIX.1-2008 extended or global header sequence 997 that contains a list of keyword, value pairs. The values 998 must be strings. 999 """ 1000 # Check if one of the fields contains surrogate characters and thereby 1001 # forces hdrcharset=BINARY, see _proc_pax() for more information. 1002 binary = False 1003 for keyword, value in pax_headers.items(): 1004 try: 1005 value.encode("utf-8", "strict") 1006 except UnicodeEncodeError: 1007 binary = True 1008 break 1009 1010 records = b"" 1011 if binary: 1012 # Put the hdrcharset field at the beginning of the header. 1013 records += b"21 hdrcharset=BINARY\n" 1014 1015 for keyword, value in pax_headers.items(): 1016 keyword = keyword.encode("utf-8") 1017 if binary: 1018 # Try to restore the original byte representation of `value'. 1019 # Needless to say, that the encoding must match the string. 1020 value = value.encode(encoding, "surrogateescape") 1021 else: 1022 value = value.encode("utf-8") 1023 1024 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' 1025 n = p = 0 1026 while True: 1027 n = l + len(str(p)) 1028 if n == p: 1029 break 1030 p = n 1031 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n" 1032 1033 # We use a hardcoded "././@PaxHeader" name like star does 1034 # instead of the one that POSIX recommends. 1035 info = {} 1036 info["name"] = "././@PaxHeader" 1037 info["type"] = type 1038 info["size"] = len(records) 1039 info["magic"] = POSIX_MAGIC 1040 1041 # Create pax header + record blocks. 1042 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \ 1043 cls._create_payload(records) 1044 1045 @classmethod 1046 def frombuf(cls, buf, encoding, errors): 1047 """Construct a TarInfo object from a 512 byte bytes object. 
1048 """ 1049 if len(buf) == 0: 1050 raise EmptyHeaderError("empty header") 1051 if len(buf) != BLOCKSIZE: 1052 raise TruncatedHeaderError("truncated header") 1053 if buf.count(NUL) == BLOCKSIZE: 1054 raise EOFHeaderError("end of file header") 1055 1056 chksum = nti(buf[148:156]) 1057 if chksum not in calc_chksums(buf): 1058 raise InvalidHeaderError("bad checksum") 1059 1060 obj = cls() 1061 obj.name = nts(buf[0:100], encoding, errors) 1062 obj.mode = nti(buf[100:108]) 1063 obj.uid = nti(buf[108:116]) 1064 obj.gid = nti(buf[116:124]) 1065 obj.size = nti(buf[124:136]) 1066 obj.mtime = nti(buf[136:148]) 1067 obj.chksum = chksum 1068 obj.type = buf[156:157] 1069 obj.linkname = nts(buf[157:257], encoding, errors) 1070 obj.uname = nts(buf[265:297], encoding, errors) 1071 obj.gname = nts(buf[297:329], encoding, errors) 1072 obj.devmajor = nti(buf[329:337]) 1073 obj.devminor = nti(buf[337:345]) 1074 prefix = nts(buf[345:500], encoding, errors) 1075 1076 # Old V7 tar format represents a directory as a regular 1077 # file with a trailing slash. 1078 if obj.type == AREGTYPE and obj.name.endswith("/"): 1079 obj.type = DIRTYPE 1080 1081 # The old GNU sparse format occupies some of the unused 1082 # space in the buffer for up to 4 sparse structures. 1083 # Save them for later processing in _proc_sparse(). 1084 if obj.type == GNUTYPE_SPARSE: 1085 pos = 386 1086 structs = [] 1087 for i in range(4): 1088 try: 1089 offset = nti(buf[pos:pos + 12]) 1090 numbytes = nti(buf[pos + 12:pos + 24]) 1091 except ValueError: 1092 break 1093 structs.append((offset, numbytes)) 1094 pos += 24 1095 isextended = bool(buf[482]) 1096 origsize = nti(buf[483:495]) 1097 obj._sparse_structs = (structs, isextended, origsize) 1098 1099 # Remove redundant slashes from directories. 1100 if obj.isdir(): 1101 obj.name = obj.name.rstrip("/") 1102 1103 # Reconstruct a ustar longname. 
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        # The header block has just been consumed, so the member starts one
        # BLOCKSIZE before the current file position.
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following
    # operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        # The member's payload is the long name/link target itself.
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError as e:
            raise SubsequentHeaderError(str(e)) from None

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extension block holds up to 21 (offset, numbytes) pairs of
            # 12 bytes each; the flag at byte 504 marks further extensions.
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # Up to here self.size was the amount of data stored in the archive;
        # expose the file's real (unpacked) size instead.
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            # Global header: update the archive-wide defaults in place.
            pax_headers = tarfile.pax_headers
        else:
            # Extended header: work on a copy so it only affects the
            # immediately following member.
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            if length == 0:
                # A zero-length record would never advance `pos' and make
                # this loop spin forever, so reject it outright.
                raise InvalidHeaderError("invalid header")
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError as e:
            raise SubsequentHeaderError(str(e)) from None

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.
        """
        # Version 0.0 stores one pax record per offset/numbytes value, so
        # scrape them straight out of the raw header buffer.
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.
        """
        # Version 0.1 stores the whole map as a single comma-separated list
        # of alternating offset and numbytes values.
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.
        """
        # Version 1.0 stores the map in the member's data area as
        # newline-terminated decimal numbers: a pair count followed by
        # that many offset/numbytes pairs.
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            # GNU sparse members carry their real name and sizes in pax
            # records instead of the regular header fields.
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        # Fall back to 0 on malformed numeric fields.
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record. Try the primary
           encoding strictly first, then fall back as configured.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    def isreg(self):
        'Return True if the Tarinfo object is a regular file.'
        return self.type in REGULAR_TYPES

    def isfile(self):
        'Return True if the Tarinfo object is a regular file.'
        return self.isreg()

    def isdir(self):
        'Return True if it is a directory.'
        return self.type == DIRTYPE

    def issym(self):
        'Return True if it is a symbolic link.'
        return self.type == SYMTYPE

    def islnk(self):
        'Return True if it is a hard link.'
        return self.type == LNKTYPE

    def ischr(self):
        'Return True if it is a character device.'
        return self.type == CHRTYPE

    def isblk(self):
        'Return True if it is a block device.'
        return self.type == BLKTYPE

    def isfifo(self):
        'Return True if it is a FIFO.'
        return self.type == FIFOTYPE

    def issparse(self):
        'Return True if the member is a GNU/pax sparse file.'
        return self.sparse is not None

    def isdev(self):
        'Return True if it is one of character device, block device or FIFO.'
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo

class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        self._mode = modes[mode]

        # Open the file ourselves if the caller did not supply a file object;
        # _extfileobj records who owns (and must close) it.
        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes. Keyword arguments override the class-level
        # defaults only when explicitly given.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # pax_headers only make sense for PAX_FORMAT archives.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.copybufsize = copybufsize
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e)) from None

            if self.mode in ("a", "w", "x"):
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any setup failure, release the file (if we own it) and
            # mark the object closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    #--------------------------------------------------------------------------
    # Below are the classmethods which act as alternate constructors to the
    # TarFile class. The open() method is the only one that is needed for
    # public use; it is the "super"-constructor and is able to select an
    # adequate "sub"-constructor for a particular compression using the mapping
    # from OPEN_METH.
    #
    # This concept allows one to subclass TarFile without losing the comfort of
    # the super-constructor. A sub-constructor is registered and made available
    # by adding it to the mapping in OPEN_METH.

    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression

           'x' or 'x:'  create a tarfile exclusively without compression, raise
                        an exception if the file is already created
           'x:gz'       create a gzip compressed tarfile, raise an exception
                        if the file is already created
           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                        if the file is already created
           'x:xz'       create an lzma compressed tarfile, raise an exception
                        if the file is already created

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            def not_compressed(comptype):
                return cls.OPEN_METH[comptype] == 'taropen'
            error_msgs = []
            # Try the compressed openers first; plain taropen sorts last
            # because False < True under the not_compressed key.
            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
                    error_msgs.append(f'- method {comptype}: {e!r}')
                    # Rewind so the next opener sees the stream from the start.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            error_msgs_summary = '\n'.join(error_msgs)
            raise ReadError(f"file could not be opened successfully:\n{error_msgs_summary}")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            # The stream was created here, so the TarFile owns it.
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")

    @classmethod
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.
        """
        if mode not in ("r", "a", "w", "x"):
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        return cls(name, mode, fileobj, **kwargs)

    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from gzip import GzipFile
        except ImportError:
            raise CompressionError("gzip module is not available") from None

        try:
            fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
        except OSError as e:
            if fileobj is not None and mode == 'r':
                raise ReadError("not a gzip file") from e
            raise

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError as e:
            # Close the GzipFile we created before propagating the error.
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a gzip file") from e
            raise
        except:
            fileobj.close()
            raise
        # The GzipFile wrapper is ours, so close it with the TarFile.
        t._extfileobj = False
        return t

    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from bz2 import BZ2File
        except ImportError:
            raise CompressionError("bz2 module is not available") from None

        fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError) as e:
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a bzip2 file") from e
            raise
        except:
            fileobj.close()
            raise
        # The BZ2File wrapper is ours, so close it with the TarFile.
        t._extfileobj = False
        return t

    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from lzma import LZMAFile, LZMAError
        except ImportError:
            raise CompressionError("lzma module is not available") from None

        fileobj = LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (LZMAError, EOFError) as e:
            fileobj.close()
            if mode == 'r':
                raise ReadError("not an lzma file") from e
            raise
        except:
            fileobj.close()
            raise
        # The LZMAFile wrapper is ours, so close it with the TarFile.
        t._extfileobj = False
        return t

    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }

    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode in ("a", "w", "x"):
                # Two zero blocks mark the end of the archive.
                self.fileobj.write(NUL * (BLOCKSIZE * 2))
                self.offset += (BLOCKSIZE * 2)
                # fill up the end with zero-blocks
                # (like option -b20 for tar does)
                blocks, remainder = divmod(self.offset, RECORDSIZE)
                if remainder > 0:
                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
        """
        tarinfo = self._getmember(name)
        if tarinfo is None:
            raise KeyError("filename %r not found" % name)
        return tarinfo

    def getmembers(self):
        """Return the members of the archive as a list of TarInfo objects. The
           list has the same order as the members in the archive.
        """
        self._check()
        if not self._loaded:    # if we want to obtain a list of
            self._load()        # all members, we first have to
                                # scan the whole archive.
        return self.members

    def getnames(self):
        """Return the members of the archive as a list of their names. It has
           the same order as the list returned by getmembers().
        """
        return [tarinfo.name for tarinfo in self.getmembers()]

    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
           on an existing file. The file is either named by `name', or
           specified as a file object `fileobj' with a file descriptor. If
           given, `arcname' specifies an alternative name for the file in the
           archive, otherwise, the name is taken from the 'name' attribute of
           'fileobj', or the 'name' argument. The name should be a text
           string.
        """
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self  # Not needed

        # Use os.stat or os.lstat, depending on if symlinks shall be resolved.
        if fileobj is None:
            if not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Unsupported file type (e.g. socket): signal with None.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        # Only regular files carry payload; everything else gets size 0.
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Resolve numeric ids to names where the platform provides pwd/grp.
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo

    def list(self, verbose=True, *, members=None):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        self._check()

        if members is None:
            members = self
        for tarinfo in members:
            if verbose:
                _safe_print(stat.filemode(tarinfo.mode))
                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                       tarinfo.gname or tarinfo.gid))
                if tarinfo.ischr() or tarinfo.isblk():
                    _safe_print("%10s" %
                                ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
                else:
                    _safe_print("%10d" % tarinfo.size)
                _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
                            % time.localtime(tarinfo.mtime)[:6])

            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

            if verbose:
                if tarinfo.issym():
                    _safe_print("-> " + tarinfo.linkname)
                if tarinfo.islnk():
                    _safe_print("link to " + tarinfo.linkname)
                print()

    def add(self, name, arcname=None, recursive=True, *, filter=None):
        """Add the file `name' to the archive. `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
           setting `recursive' to False. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
        """
        self._check("awx")

        if arcname is None:
            arcname = name

        # Skip if somebody tries to archive the archive...
        if self.name is not None and os.path.abspath(name) == self.name:
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

        # Append the tar header and data to the archive.
        if tarinfo.isreg():
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)

        elif tarinfo.isdir():
            self.addfile(tarinfo)
            if recursive:
                # Sort for a reproducible member order across runs.
                for f in sorted(os.listdir(name)):
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
                            recursive, filter=filter)

        else:
            self.addfile(tarinfo)

    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, it should be a binary file, and tarinfo.size bytes are read
           from it and added to the archive. You can create TarInfo objects
           directly, or by using gettarinfo().
2000 """ 2001 self._check("awx") 2002 2003 tarinfo = copy.copy(tarinfo) 2004 2005 buf = tarinfo.tobuf(self.format, self.encoding, self.errors) 2006 self.fileobj.write(buf) 2007 self.offset += len(buf) 2008 bufsize=self.copybufsize 2009 # If there's data to follow, append it. 2010 if fileobj is not None: 2011 copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize) 2012 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) 2013 if remainder > 0: 2014 self.fileobj.write(NUL * (BLOCKSIZE - remainder)) 2015 blocks += 1 2016 self.offset += blocks * BLOCKSIZE 2017 2018 self.members.append(tarinfo) 2019 2020 def extractall(self, path=".", members=None, *, numeric_owner=False): 2021 """Extract all members from the archive to the current working 2022 directory and set owner, modification time and permissions on 2023 directories afterwards. `path' specifies a different directory 2024 to extract to. `members' is optional and must be a subset of the 2025 list returned by getmembers(). If `numeric_owner` is True, only 2026 the numbers for user/group names are used and not the names. 2027 """ 2028 directories = [] 2029 2030 if members is None: 2031 members = self 2032 2033 for tarinfo in members: 2034 if tarinfo.isdir(): 2035 # Extract directories with a safe mode. 2036 directories.append(tarinfo) 2037 tarinfo = copy.copy(tarinfo) 2038 tarinfo.mode = 0o700 2039 # Do not set_attrs directories, as we will do that further down 2040 self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(), 2041 numeric_owner=numeric_owner) 2042 2043 # Reverse sort directories. 2044 directories.sort(key=lambda a: a.name) 2045 directories.reverse() 2046 2047 # Set correct owner, mtime and filemode on directories. 
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
            try:
                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)

    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
           is True, only the numbers for user/group names are used and not
           the names.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        # NOTE(review): tarinfo.name is joined onto `path' without any
        # sanitization, so archives containing absolute names or ".."
        # components can write outside `path' (CVE-2007-4559).  Do not
        # extract archives from untrusted sources.
        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs,
                                 numeric_owner=numeric_owner)
        except OSError as e:
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)

    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file or
           a link, an io.BufferedReader object is returned. For all other
           existing members, None is returned. If `member' does not appear
           in the archive, KeyError is raised.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
            # Members with unknown types are treated as regular files.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None

    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                        numeric_owner=False):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch on the member type; unknown types fall back to
        # makeunknown(), everything else to makefile().
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath, numeric_owner)
            # chmod/utime on a symlink would affect the link's target.
            if not tarinfo.issym():
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)

    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            # Use a safe mode for the directory, the real mode is set
            # later in _extract_member().
            os.mkdir(targetpath, 0o700)
        except FileExistsError:
            pass

    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.fileobj
        source.seek(tarinfo.offset_data)
        bufsize = self.copybufsize
        with bltn_open(targetpath, "wb") as target:
            if tarinfo.sparse is not None:
                # Write only the data runs of a sparse member, then extend
                # the file to its full size with truncate().
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size, ReadError, bufsize)
                target.seek(tarinfo.size)
                target.truncate()
            else:
                copyfileobj(source, target, tarinfo.size, ReadError, bufsize)

    def makeunknown(self, tarinfo, targetpath):
        """Make a file from a TarInfo object with an unknown type
           at targetpath.
        """
        self.makefile(tarinfo, targetpath)
        self._dbg(1, "tarfile: Unknown file type %r, " \
                     "extracted as regular file." % tarinfo.type)

    def makefifo(self, tarinfo, targetpath):
        """Make a fifo called targetpath.
        """
        if hasattr(os, "mkfifo"):
            os.mkfifo(targetpath)
        else:
            raise ExtractError("fifo not supported by system")

    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.
        """
        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
            raise ExtractError("special devices not supported by system")

        mode = tarinfo.mode
        if tarinfo.isblk():
            mode |= stat.S_IFBLK
        else:
            mode |= stat.S_IFCHR

        os.mknod(targetpath, mode,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))

    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
          (platform limitation), we try to make a copy of the referenced file
          instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                if os.path.lexists(targetpath):
                    # Avoid FileExistsError on following os.symlink.
                    os.unlink(targetpath)
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract().
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # Links are not supported on this platform; fall back to
            # extracting a full copy of the link's target instead.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive") from None

    def chown(self, tarinfo, targetpath, numeric_owner):
        """Set owner of targetpath according to tarinfo. If numeric_owner
           is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
           is False, fall back to .gid/.uid when the search based on name
           fails.
        """
        if hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            g = tarinfo.gid
            u = tarinfo.uid
            if not numeric_owner:
                # Name lookups may fail (unknown user/group on this host);
                # silently keep the numeric ids in that case.
                try:
                    if grp:
                        g = grp.getgrnam(tarinfo.gname)[2]
                except KeyError:
                    pass
                try:
                    if pwd:
                        u = pwd.getpwnam(tarinfo.uname)[2]
                except KeyError:
                    pass
            try:
                if tarinfo.issym() and hasattr(os, "lchown"):
                    # Change the link itself, not its target.
                    os.lchown(targetpath, u, g)
                else:
                    os.chown(targetpath, u, g)
            except OSError as e:
                raise ExtractError("could not change owner") from e

    def chmod(self, tarinfo, targetpath):
        """Set file permissions of targetpath according to tarinfo.
        """
        try:
            os.chmod(targetpath, tarinfo.mode)
        except OSError as e:
            raise ExtractError("could not change mode") from e

    def utime(self, tarinfo, targetpath):
        """Set modification time of targetpath according to tarinfo.
        """
        if not hasattr(os, 'utime'):
            return
        try:
            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
        except OSError as e:
            raise ExtractError("could not change modification time") from e

    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            # seek()-then-read(1) also verifies that the file has not been
            # truncated before the expected offset.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    raise ReadError(str(e)) from None
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file") from None
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e)) from None
            except SubsequentHeaderError as e:
                raise ReadError(str(e)) from None
            except Exception as e:
                # A compressed stream may surface a zlib.error here; report
                # it as a ReadError.  zlib may be unavailable, hence the
                # guarded import.
                try:
                    import zlib
                    if isinstance(e, zlib.error):
                        raise ReadError(f'zlib error: {e}') from None
                    else:
                        raise e
                except ImportError:
                    raise e
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            self._loaded = True

        return tarinfo
    #--------------------------------------------------------------------------
    # Little helper methods:

    def _getmember(self, name, tarinfo=None, normalize=False):
        """Find an archive member by name from bottom to top.
           If tarinfo is given, it is used as the starting point.
        """
        # Ensure that all members have been loaded.
        members = self.getmembers()

        # Limit the member search list up to tarinfo.
        if tarinfo is not None:
            members = members[:members.index(tarinfo)]

        if normalize:
            name = os.path.normpath(name)

        # Search from the end so that the most recently archived member
        # with a matching name wins.
        for member in reversed(members):
            if normalize:
                member_name = os.path.normpath(member.name)
            else:
                member_name = member.name

            if name == member_name:
                return member

    def _load(self):
        """Read through the entire archive file and look for readable
           members.
        """
        while True:
            tarinfo = self.next()
            if tarinfo is None:
                break
        self._loaded = True

    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
           corresponds to TarFile's mode.
        """
        if self.closed:
            raise OSError("%s is closed" % self.__class__.__name__)
        if mode is not None and self.mode not in mode:
            raise OSError("bad operation for mode %r" % self.mode)

    def _find_link_target(self, tarinfo):
        """Find the target member of a symlink or hardlink member in the
           archive.
        """
        if tarinfo.issym():
            # Always search the entire archive.
            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
            limit = None
        else:
            # Search the archive before the link, because a hard link is
            # just a reference to an already archived file.
            linkname = tarinfo.linkname
            limit = tarinfo

        member = self._getmember(linkname, tarinfo=limit, normalize=True)
        if member is None:
            raise KeyError("linkname %r not found" % linkname)
        return member

    def __iter__(self):
        """Provide an iterator object.
        """
        if self._loaded:
            yield from self.members
            return

        # Yield items using TarFile's next() method.
        # When all members have been read, set TarFile as _loaded.
        index = 0
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will have already exhausted the next() method.
        if self.firstmember is not None:
            tarinfo = self.next()
            index += 1
            yield tarinfo

        while True:
            if index < len(self.members):
                tarinfo = self.members[index]
            elif not self._loaded:
                tarinfo = self.next()
                if not tarinfo:
                    self._loaded = True
                    return
            else:
                return
            index += 1
            yield tarinfo

    def _dbg(self, level, msg):
        """Write debugging output to sys.stderr.
        """
        if level <= self.debug:
            print(msg, file=sys.stderr)

    def __enter__(self):
        self._check()
        return self

    def __exit__(self, type, value, traceback):
        if type is None:
            self.close()
        else:
            # An exception occurred. We must not call close() because
            # it would try to write end-of-archive blocks and padding.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True

#--------------------
# exported functions
#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       'name' should be a string, file, or file-like object.
2493 """ 2494 try: 2495 if hasattr(name, "read"): 2496 t = open(fileobj=name) 2497 else: 2498 t = open(name) 2499 t.close() 2500 return True 2501 except TarError: 2502 return False 2503 2504open = TarFile.open 2505 2506 2507def main(): 2508 import argparse 2509 2510 description = 'A simple command-line interface for tarfile module.' 2511 parser = argparse.ArgumentParser(description=description) 2512 parser.add_argument('-v', '--verbose', action='store_true', default=False, 2513 help='Verbose output') 2514 group = parser.add_mutually_exclusive_group(required=True) 2515 group.add_argument('-l', '--list', metavar='<tarfile>', 2516 help='Show listing of a tarfile') 2517 group.add_argument('-e', '--extract', nargs='+', 2518 metavar=('<tarfile>', '<output_dir>'), 2519 help='Extract tarfile into target dir') 2520 group.add_argument('-c', '--create', nargs='+', 2521 metavar=('<name>', '<file>'), 2522 help='Create tarfile from sources') 2523 group.add_argument('-t', '--test', metavar='<tarfile>', 2524 help='Test if a tarfile is valid') 2525 args = parser.parse_args() 2526 2527 if args.test is not None: 2528 src = args.test 2529 if is_tarfile(src): 2530 with open(src, 'r') as tar: 2531 tar.getmembers() 2532 print(tar.getmembers(), file=sys.stderr) 2533 if args.verbose: 2534 print('{!r} is a tar archive.'.format(src)) 2535 else: 2536 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2537 2538 elif args.list is not None: 2539 src = args.list 2540 if is_tarfile(src): 2541 with TarFile.open(src, 'r:*') as tf: 2542 tf.list(verbose=args.verbose) 2543 else: 2544 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2545 2546 elif args.extract is not None: 2547 if len(args.extract) == 1: 2548 src = args.extract[0] 2549 curdir = os.curdir 2550 elif len(args.extract) == 2: 2551 src, curdir = args.extract 2552 else: 2553 parser.exit(1, parser.format_help()) 2554 2555 if is_tarfile(src): 2556 with TarFile.open(src, 'r:*') as tf: 2557 tf.extractall(path=curdir) 2558 if 
args.verbose: 2559 if curdir == '.': 2560 msg = '{!r} file is extracted.'.format(src) 2561 else: 2562 msg = ('{!r} file is extracted ' 2563 'into {!r} directory.').format(src, curdir) 2564 print(msg) 2565 else: 2566 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2567 2568 elif args.create is not None: 2569 tar_name = args.create.pop(0) 2570 _, ext = os.path.splitext(tar_name) 2571 compressions = { 2572 # gz 2573 '.gz': 'gz', 2574 '.tgz': 'gz', 2575 # xz 2576 '.xz': 'xz', 2577 '.txz': 'xz', 2578 # bz2 2579 '.bz2': 'bz2', 2580 '.tbz': 'bz2', 2581 '.tbz2': 'bz2', 2582 '.tb2': 'bz2', 2583 } 2584 tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w' 2585 tar_files = args.create 2586 2587 with TarFile.open(tar_name, tar_mode) as tf: 2588 for file_name in tar_files: 2589 tf.add(file_name) 2590 2591 if args.verbose: 2592 print('{!r} file created.'.format(tar_name)) 2593 2594if __name__ == '__main__': 2595 main() 2596