1#!/usr/bin/env python3 2#------------------------------------------------------------------- 3# tarfile.py 4#------------------------------------------------------------------- 5# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de> 6# All rights reserved. 7# 8# Permission is hereby granted, free of charge, to any person 9# obtaining a copy of this software and associated documentation 10# files (the "Software"), to deal in the Software without 11# restriction, including without limitation the rights to use, 12# copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the 14# Software is furnished to do so, subject to the following 15# conditions: 16# 17# The above copyright notice and this permission notice shall be 18# included in all copies or substantial portions of the Software. 19# 20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 22# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 24# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 25# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 27# OTHER DEALINGS IN THE SOFTWARE. 28# 29"""Read from and write to tar format archives. 30""" 31 32version = "0.9.0" 33__author__ = "Lars Gust\u00e4bel (lars@gustaebel.de)" 34__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend." 
35 36#--------- 37# Imports 38#--------- 39from builtins import open as bltn_open 40import sys 41import os 42import io 43import shutil 44import stat 45import time 46import struct 47import copy 48import re 49 50try: 51 import pwd 52except ImportError: 53 pwd = None 54try: 55 import grp 56except ImportError: 57 grp = None 58 59# os.symlink on Windows prior to 6.0 raises NotImplementedError 60symlink_exception = (AttributeError, NotImplementedError) 61try: 62 # OSError (winerror=1314) will be raised if the caller does not hold the 63 # SeCreateSymbolicLinkPrivilege privilege 64 symlink_exception += (OSError,) 65except NameError: 66 pass 67 68# from tarfile import * 69__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError", 70 "CompressionError", "StreamError", "ExtractError", "HeaderError", 71 "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT", 72 "DEFAULT_FORMAT", "open"] 73 74#--------------------------------------------------------- 75# tar constants 76#--------------------------------------------------------- 77NUL = b"\0" # the null character 78BLOCKSIZE = 512 # length of processing blocks 79RECORDSIZE = BLOCKSIZE * 20 # length of records 80GNU_MAGIC = b"ustar \0" # magic gnu tar string 81POSIX_MAGIC = b"ustar\x0000" # magic posix tar string 82 83LENGTH_NAME = 100 # maximum length of a filename 84LENGTH_LINK = 100 # maximum length of a linkname 85LENGTH_PREFIX = 155 # maximum length of the prefix field 86 87REGTYPE = b"0" # regular file 88AREGTYPE = b"\0" # regular file 89LNKTYPE = b"1" # link (inside tarfile) 90SYMTYPE = b"2" # symbolic link 91CHRTYPE = b"3" # character special device 92BLKTYPE = b"4" # block special device 93DIRTYPE = b"5" # directory 94FIFOTYPE = b"6" # fifo special device 95CONTTYPE = b"7" # contiguous file 96 97GNUTYPE_LONGNAME = b"L" # GNU tar longname 98GNUTYPE_LONGLINK = b"K" # GNU tar longlink 99GNUTYPE_SPARSE = b"S" # GNU tar sparse file 100 101XHDTYPE = b"x" # POSIX.1-2001 extended header 102XGLTYPE = b"g" # 
POSIX.1-2001 global header 103SOLARIS_XHDTYPE = b"X" # Solaris extended header 104 105USTAR_FORMAT = 0 # POSIX.1-1988 (ustar) format 106GNU_FORMAT = 1 # GNU tar format 107PAX_FORMAT = 2 # POSIX.1-2001 (pax) format 108DEFAULT_FORMAT = PAX_FORMAT 109 110#--------------------------------------------------------- 111# tarfile constants 112#--------------------------------------------------------- 113# File types that tarfile supports: 114SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE, 115 SYMTYPE, DIRTYPE, FIFOTYPE, 116 CONTTYPE, CHRTYPE, BLKTYPE, 117 GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 118 GNUTYPE_SPARSE) 119 120# File types that will be treated as a regular file. 121REGULAR_TYPES = (REGTYPE, AREGTYPE, 122 CONTTYPE, GNUTYPE_SPARSE) 123 124# File types that are part of the GNU tar format. 125GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK, 126 GNUTYPE_SPARSE) 127 128# Fields from a pax header that override a TarInfo attribute. 129PAX_FIELDS = ("path", "linkpath", "size", "mtime", 130 "uid", "gid", "uname", "gname") 131 132# Fields from a pax header that are affected by hdrcharset. 133PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"} 134 135# Fields in a pax header that are numbers, all other fields 136# are treated as strings. 137PAX_NUMBER_FIELDS = { 138 "atime": float, 139 "ctime": float, 140 "mtime": float, 141 "uid": int, 142 "gid": int, 143 "size": int 144} 145 146#--------------------------------------------------------- 147# initialization 148#--------------------------------------------------------- 149if os.name == "nt": 150 ENCODING = "utf-8" 151else: 152 ENCODING = sys.getfilesystemencoding() 153 154#--------------------------------------------------------- 155# Some useful functions 156#--------------------------------------------------------- 157 158def stn(s, length, encoding, errors): 159 """Convert a string to a null-terminated bytes object. 
160 """ 161 s = s.encode(encoding, errors) 162 return s[:length] + (length - len(s)) * NUL 163 164def nts(s, encoding, errors): 165 """Convert a null-terminated bytes object to a string. 166 """ 167 p = s.find(b"\0") 168 if p != -1: 169 s = s[:p] 170 return s.decode(encoding, errors) 171 172def nti(s): 173 """Convert a number field to a python number. 174 """ 175 # There are two possible encodings for a number field, see 176 # itn() below. 177 if s[0] in (0o200, 0o377): 178 n = 0 179 for i in range(len(s) - 1): 180 n <<= 8 181 n += s[i + 1] 182 if s[0] == 0o377: 183 n = -(256 ** (len(s) - 1) - n) 184 else: 185 try: 186 s = nts(s, "ascii", "strict") 187 n = int(s.strip() or "0", 8) 188 except ValueError: 189 raise InvalidHeaderError("invalid header") 190 return n 191 192def itn(n, digits=8, format=DEFAULT_FORMAT): 193 """Convert a python number to a number field. 194 """ 195 # POSIX 1003.1-1988 requires numbers to be encoded as a string of 196 # octal digits followed by a null-byte, this allows values up to 197 # (8**(digits-1))-1. GNU tar allows storing numbers greater than 198 # that if necessary. A leading 0o200 or 0o377 byte indicate this 199 # particular encoding, the following digits-1 bytes are a big-endian 200 # base-256 representation. This allows values up to (256**(digits-1))-1. 201 # A 0o200 byte indicates a positive number, a 0o377 byte a negative 202 # number. 
203 n = int(n) 204 if 0 <= n < 8 ** (digits - 1): 205 s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL 206 elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1): 207 if n >= 0: 208 s = bytearray([0o200]) 209 else: 210 s = bytearray([0o377]) 211 n = 256 ** digits + n 212 213 for i in range(digits - 1): 214 s.insert(1, n & 0o377) 215 n >>= 8 216 else: 217 raise ValueError("overflow in number field") 218 219 return s 220 221def calc_chksums(buf): 222 """Calculate the checksum for a member's header by summing up all 223 characters except for the chksum field which is treated as if 224 it was filled with spaces. According to the GNU tar sources, 225 some tars (Sun and NeXT) calculate chksum with signed char, 226 which will be different if there are chars in the buffer with 227 the high bit set. So we calculate two checksums, unsigned and 228 signed. 229 """ 230 unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf)) 231 signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf)) 232 return unsigned_chksum, signed_chksum 233 234def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None): 235 """Copy length bytes from fileobj src to fileobj dst. 236 If length is None, copy the entire content. 
237 """ 238 bufsize = bufsize or 16 * 1024 239 if length == 0: 240 return 241 if length is None: 242 shutil.copyfileobj(src, dst, bufsize) 243 return 244 245 blocks, remainder = divmod(length, bufsize) 246 for b in range(blocks): 247 buf = src.read(bufsize) 248 if len(buf) < bufsize: 249 raise exception("unexpected end of data") 250 dst.write(buf) 251 252 if remainder != 0: 253 buf = src.read(remainder) 254 if len(buf) < remainder: 255 raise exception("unexpected end of data") 256 dst.write(buf) 257 return 258 259def _safe_print(s): 260 encoding = getattr(sys.stdout, 'encoding', None) 261 if encoding is not None: 262 s = s.encode(encoding, 'backslashreplace').decode(encoding) 263 print(s, end=' ') 264 265 266class TarError(Exception): 267 """Base exception.""" 268 pass 269class ExtractError(TarError): 270 """General exception for extract errors.""" 271 pass 272class ReadError(TarError): 273 """Exception for unreadable tar archives.""" 274 pass 275class CompressionError(TarError): 276 """Exception for unavailable compression methods.""" 277 pass 278class StreamError(TarError): 279 """Exception for unsupported operations on stream-like TarFiles.""" 280 pass 281class HeaderError(TarError): 282 """Base exception for header errors.""" 283 pass 284class EmptyHeaderError(HeaderError): 285 """Exception for empty headers.""" 286 pass 287class TruncatedHeaderError(HeaderError): 288 """Exception for truncated headers.""" 289 pass 290class EOFHeaderError(HeaderError): 291 """Exception for end of file headers.""" 292 pass 293class InvalidHeaderError(HeaderError): 294 """Exception for invalid headers.""" 295 pass 296class SubsequentHeaderError(HeaderError): 297 """Exception for missing and invalid extended headers.""" 298 pass 299 300#--------------------------- 301# internal stream interface 302#--------------------------- 303class _LowLevelFile: 304 """Low-level file object. Supports reading and writing. 305 It is used instead of a regular file object for streaming 306 access. 
307 """ 308 309 def __init__(self, name, mode): 310 mode = { 311 "r": os.O_RDONLY, 312 "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 313 }[mode] 314 if hasattr(os, "O_BINARY"): 315 mode |= os.O_BINARY 316 self.fd = os.open(name, mode, 0o666) 317 318 def close(self): 319 os.close(self.fd) 320 321 def read(self, size): 322 return os.read(self.fd, size) 323 324 def write(self, s): 325 os.write(self.fd, s) 326 327class _Stream: 328 """Class that serves as an adapter between TarFile and 329 a stream-like object. The stream-like object only 330 needs to have a read() or write() method and is accessed 331 blockwise. Use of gzip or bzip2 compression is possible. 332 A stream-like object could be for example: sys.stdin, 333 sys.stdout, a socket, a tape device etc. 334 335 _Stream is intended to be used only internally. 336 """ 337 338 def __init__(self, name, mode, comptype, fileobj, bufsize): 339 """Construct a _Stream object. 340 """ 341 self._extfileobj = True 342 if fileobj is None: 343 fileobj = _LowLevelFile(name, mode) 344 self._extfileobj = False 345 346 if comptype == '*': 347 # Enable transparent compression detection for the 348 # stream interface 349 fileobj = _StreamProxy(fileobj) 350 comptype = fileobj.getcomptype() 351 352 self.name = name or "" 353 self.mode = mode 354 self.comptype = comptype 355 self.fileobj = fileobj 356 self.bufsize = bufsize 357 self.buf = b"" 358 self.pos = 0 359 self.closed = False 360 361 try: 362 if comptype == "gz": 363 try: 364 import zlib 365 except ImportError: 366 raise CompressionError("zlib module is not available") 367 self.zlib = zlib 368 self.crc = zlib.crc32(b"") 369 if mode == "r": 370 self._init_read_gz() 371 self.exception = zlib.error 372 else: 373 self._init_write_gz() 374 375 elif comptype == "bz2": 376 try: 377 import bz2 378 except ImportError: 379 raise CompressionError("bz2 module is not available") 380 if mode == "r": 381 self.dbuf = b"" 382 self.cmp = bz2.BZ2Decompressor() 383 self.exception = OSError 384 else: 
385 self.cmp = bz2.BZ2Compressor() 386 387 elif comptype == "xz": 388 try: 389 import lzma 390 except ImportError: 391 raise CompressionError("lzma module is not available") 392 if mode == "r": 393 self.dbuf = b"" 394 self.cmp = lzma.LZMADecompressor() 395 self.exception = lzma.LZMAError 396 else: 397 self.cmp = lzma.LZMACompressor() 398 399 elif comptype != "tar": 400 raise CompressionError("unknown compression type %r" % comptype) 401 402 except: 403 if not self._extfileobj: 404 self.fileobj.close() 405 self.closed = True 406 raise 407 408 def __del__(self): 409 if hasattr(self, "closed") and not self.closed: 410 self.close() 411 412 def _init_write_gz(self): 413 """Initialize for writing with gzip compression. 414 """ 415 self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED, 416 -self.zlib.MAX_WBITS, 417 self.zlib.DEF_MEM_LEVEL, 418 0) 419 timestamp = struct.pack("<L", int(time.time())) 420 self.__write(b"\037\213\010\010" + timestamp + b"\002\377") 421 if self.name.endswith(".gz"): 422 self.name = self.name[:-3] 423 # RFC1952 says we must use ISO-8859-1 for the FNAME field. 424 self.__write(self.name.encode("iso-8859-1", "replace") + NUL) 425 426 def write(self, s): 427 """Write string s to the stream. 428 """ 429 if self.comptype == "gz": 430 self.crc = self.zlib.crc32(s, self.crc) 431 self.pos += len(s) 432 if self.comptype != "tar": 433 s = self.cmp.compress(s) 434 self.__write(s) 435 436 def __write(self, s): 437 """Write string s to the stream if a whole new block 438 is ready to be written. 439 """ 440 self.buf += s 441 while len(self.buf) > self.bufsize: 442 self.fileobj.write(self.buf[:self.bufsize]) 443 self.buf = self.buf[self.bufsize:] 444 445 def close(self): 446 """Close the _Stream object. No operation should be 447 done on it afterwards. 
448 """ 449 if self.closed: 450 return 451 452 self.closed = True 453 try: 454 if self.mode == "w" and self.comptype != "tar": 455 self.buf += self.cmp.flush() 456 457 if self.mode == "w" and self.buf: 458 self.fileobj.write(self.buf) 459 self.buf = b"" 460 if self.comptype == "gz": 461 self.fileobj.write(struct.pack("<L", self.crc)) 462 self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF)) 463 finally: 464 if not self._extfileobj: 465 self.fileobj.close() 466 467 def _init_read_gz(self): 468 """Initialize for reading a gzip compressed fileobj. 469 """ 470 self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS) 471 self.dbuf = b"" 472 473 # taken from gzip.GzipFile with some alterations 474 if self.__read(2) != b"\037\213": 475 raise ReadError("not a gzip file") 476 if self.__read(1) != b"\010": 477 raise CompressionError("unsupported compression method") 478 479 flag = ord(self.__read(1)) 480 self.__read(6) 481 482 if flag & 4: 483 xlen = ord(self.__read(1)) + 256 * ord(self.__read(1)) 484 self.read(xlen) 485 if flag & 8: 486 while True: 487 s = self.__read(1) 488 if not s or s == NUL: 489 break 490 if flag & 16: 491 while True: 492 s = self.__read(1) 493 if not s or s == NUL: 494 break 495 if flag & 2: 496 self.__read(2) 497 498 def tell(self): 499 """Return the stream's file pointer position. 500 """ 501 return self.pos 502 503 def seek(self, pos=0): 504 """Set the stream's file pointer to pos. Negative seeking 505 is forbidden. 506 """ 507 if pos - self.pos >= 0: 508 blocks, remainder = divmod(pos - self.pos, self.bufsize) 509 for i in range(blocks): 510 self.read(self.bufsize) 511 self.read(remainder) 512 else: 513 raise StreamError("seeking backwards is not allowed") 514 return self.pos 515 516 def read(self, size): 517 """Return the next size number of bytes from the stream.""" 518 assert size is not None 519 buf = self._read(size) 520 self.pos += len(buf) 521 return buf 522 523 def _read(self, size): 524 """Return size bytes from the stream. 
525 """ 526 if self.comptype == "tar": 527 return self.__read(size) 528 529 c = len(self.dbuf) 530 t = [self.dbuf] 531 while c < size: 532 # Skip underlying buffer to avoid unaligned double buffering. 533 if self.buf: 534 buf = self.buf 535 self.buf = b"" 536 else: 537 buf = self.fileobj.read(self.bufsize) 538 if not buf: 539 break 540 try: 541 buf = self.cmp.decompress(buf) 542 except self.exception: 543 raise ReadError("invalid compressed data") 544 t.append(buf) 545 c += len(buf) 546 t = b"".join(t) 547 self.dbuf = t[size:] 548 return t[:size] 549 550 def __read(self, size): 551 """Return size bytes from stream. If internal buffer is empty, 552 read another block from the stream. 553 """ 554 c = len(self.buf) 555 t = [self.buf] 556 while c < size: 557 buf = self.fileobj.read(self.bufsize) 558 if not buf: 559 break 560 t.append(buf) 561 c += len(buf) 562 t = b"".join(t) 563 self.buf = t[size:] 564 return t[:size] 565# class _Stream 566 567class _StreamProxy(object): 568 """Small proxy class that enables transparent compression 569 detection for the Stream interface (mode 'r|*'). 570 """ 571 572 def __init__(self, fileobj): 573 self.fileobj = fileobj 574 self.buf = self.fileobj.read(BLOCKSIZE) 575 576 def read(self, size): 577 self.read = self.fileobj.read 578 return self.buf 579 580 def getcomptype(self): 581 if self.buf.startswith(b"\x1f\x8b\x08"): 582 return "gz" 583 elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY": 584 return "bz2" 585 elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")): 586 return "xz" 587 else: 588 return "tar" 589 590 def close(self): 591 self.fileobj.close() 592# class StreamProxy 593 594#------------------------ 595# Extraction file object 596#------------------------ 597class _FileInFile(object): 598 """A thin wrapper around an existing file object that 599 provides a part of its data as an individual file 600 object. 
601 """ 602 603 def __init__(self, fileobj, offset, size, blockinfo=None): 604 self.fileobj = fileobj 605 self.offset = offset 606 self.size = size 607 self.position = 0 608 self.name = getattr(fileobj, "name", None) 609 self.closed = False 610 611 if blockinfo is None: 612 blockinfo = [(0, size)] 613 614 # Construct a map with data and zero blocks. 615 self.map_index = 0 616 self.map = [] 617 lastpos = 0 618 realpos = self.offset 619 for offset, size in blockinfo: 620 if offset > lastpos: 621 self.map.append((False, lastpos, offset, None)) 622 self.map.append((True, offset, offset + size, realpos)) 623 realpos += size 624 lastpos = offset + size 625 if lastpos < self.size: 626 self.map.append((False, lastpos, self.size, None)) 627 628 def flush(self): 629 pass 630 631 def readable(self): 632 return True 633 634 def writable(self): 635 return False 636 637 def seekable(self): 638 return self.fileobj.seekable() 639 640 def tell(self): 641 """Return the current file position. 642 """ 643 return self.position 644 645 def seek(self, position, whence=io.SEEK_SET): 646 """Seek to a position in the file. 647 """ 648 if whence == io.SEEK_SET: 649 self.position = min(max(position, 0), self.size) 650 elif whence == io.SEEK_CUR: 651 if position < 0: 652 self.position = max(self.position + position, 0) 653 else: 654 self.position = min(self.position + position, self.size) 655 elif whence == io.SEEK_END: 656 self.position = max(min(self.size + position, self.size), 0) 657 else: 658 raise ValueError("Invalid argument") 659 return self.position 660 661 def read(self, size=None): 662 """Read data from the file. 
663 """ 664 if size is None: 665 size = self.size - self.position 666 else: 667 size = min(size, self.size - self.position) 668 669 buf = b"" 670 while size > 0: 671 while True: 672 data, start, stop, offset = self.map[self.map_index] 673 if start <= self.position < stop: 674 break 675 else: 676 self.map_index += 1 677 if self.map_index == len(self.map): 678 self.map_index = 0 679 length = min(size, stop - self.position) 680 if data: 681 self.fileobj.seek(offset + (self.position - start)) 682 b = self.fileobj.read(length) 683 if len(b) != length: 684 raise ReadError("unexpected end of data") 685 buf += b 686 else: 687 buf += NUL * length 688 size -= length 689 self.position += length 690 return buf 691 692 def readinto(self, b): 693 buf = self.read(len(b)) 694 b[:len(buf)] = buf 695 return len(buf) 696 697 def close(self): 698 self.closed = True 699#class _FileInFile 700 701class ExFileObject(io.BufferedReader): 702 703 def __init__(self, tarfile, tarinfo): 704 fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data, 705 tarinfo.size, tarinfo.sparse) 706 super().__init__(fileobj) 707#class ExFileObject 708 709#------------------ 710# Exported Classes 711#------------------ 712class TarInfo(object): 713 """Informational class which holds the details about an 714 archive member given by a tar header block. 715 TarInfo objects are returned by TarFile.getmember(), 716 TarFile.getmembers() and TarFile.gettarinfo() and are 717 usually created internally. 718 """ 719 720 __slots__ = dict( 721 name = 'Name of the archive member.', 722 mode = 'Permission bits.', 723 uid = 'User ID of the user who originally stored this member.', 724 gid = 'Group ID of the user who originally stored this member.', 725 size = 'Size in bytes.', 726 mtime = 'Time of last modification.', 727 chksum = 'Header checksum.', 728 type = ('File type. 
type is usually one of these constants: ' 729 'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, ' 730 'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'), 731 linkname = ('Name of the target file name, which is only present ' 732 'in TarInfo objects of type LNKTYPE and SYMTYPE.'), 733 uname = 'User name.', 734 gname = 'Group name.', 735 devmajor = 'Device major number.', 736 devminor = 'Device minor number.', 737 offset = 'The tar header starts here.', 738 offset_data = "The file's data starts here.", 739 pax_headers = ('A dictionary containing key-value pairs of an ' 740 'associated pax extended header.'), 741 sparse = 'Sparse member information.', 742 tarfile = None, 743 _sparse_structs = None, 744 _link_target = None, 745 ) 746 747 def __init__(self, name=""): 748 """Construct a TarInfo object. name is the optional name 749 of the member. 750 """ 751 self.name = name # member name 752 self.mode = 0o644 # file permissions 753 self.uid = 0 # user id 754 self.gid = 0 # group id 755 self.size = 0 # file size 756 self.mtime = 0 # modification time 757 self.chksum = 0 # header checksum 758 self.type = REGTYPE # member type 759 self.linkname = "" # link name 760 self.uname = "" # user name 761 self.gname = "" # group name 762 self.devmajor = 0 # device major number 763 self.devminor = 0 # device minor number 764 765 self.offset = 0 # the tar header starts here 766 self.offset_data = 0 # the file's data starts here 767 768 self.sparse = None # sparse member information 769 self.pax_headers = {} # pax header information 770 771 @property 772 def path(self): 773 'In pax headers, "name" is called "path".' 774 return self.name 775 776 @path.setter 777 def path(self, name): 778 self.name = name 779 780 @property 781 def linkpath(self): 782 'In pax headers, "linkname" is called "linkpath".' 
783 return self.linkname 784 785 @linkpath.setter 786 def linkpath(self, linkname): 787 self.linkname = linkname 788 789 def __repr__(self): 790 return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self)) 791 792 def get_info(self): 793 """Return the TarInfo's attributes as a dictionary. 794 """ 795 info = { 796 "name": self.name, 797 "mode": self.mode & 0o7777, 798 "uid": self.uid, 799 "gid": self.gid, 800 "size": self.size, 801 "mtime": self.mtime, 802 "chksum": self.chksum, 803 "type": self.type, 804 "linkname": self.linkname, 805 "uname": self.uname, 806 "gname": self.gname, 807 "devmajor": self.devmajor, 808 "devminor": self.devminor 809 } 810 811 if info["type"] == DIRTYPE and not info["name"].endswith("/"): 812 info["name"] += "/" 813 814 return info 815 816 def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"): 817 """Return a tar header as a string of 512 byte blocks. 818 """ 819 info = self.get_info() 820 821 if format == USTAR_FORMAT: 822 return self.create_ustar_header(info, encoding, errors) 823 elif format == GNU_FORMAT: 824 return self.create_gnu_header(info, encoding, errors) 825 elif format == PAX_FORMAT: 826 return self.create_pax_header(info, encoding) 827 else: 828 raise ValueError("invalid format") 829 830 def create_ustar_header(self, info, encoding, errors): 831 """Return the object as a ustar header block. 832 """ 833 info["magic"] = POSIX_MAGIC 834 835 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 836 raise ValueError("linkname is too long") 837 838 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 839 info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors) 840 841 return self._create_header(info, USTAR_FORMAT, encoding, errors) 842 843 def create_gnu_header(self, info, encoding, errors): 844 """Return the object as a GNU header block sequence. 
845 """ 846 info["magic"] = GNU_MAGIC 847 848 buf = b"" 849 if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK: 850 buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors) 851 852 if len(info["name"].encode(encoding, errors)) > LENGTH_NAME: 853 buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors) 854 855 return buf + self._create_header(info, GNU_FORMAT, encoding, errors) 856 857 def create_pax_header(self, info, encoding): 858 """Return the object as a ustar header block. If it cannot be 859 represented this way, prepend a pax extended header sequence 860 with supplement information. 861 """ 862 info["magic"] = POSIX_MAGIC 863 pax_headers = self.pax_headers.copy() 864 865 # Test string fields for values that exceed the field length or cannot 866 # be represented in ASCII encoding. 867 for name, hname, length in ( 868 ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK), 869 ("uname", "uname", 32), ("gname", "gname", 32)): 870 871 if hname in pax_headers: 872 # The pax header has priority. 873 continue 874 875 # Try to encode the string as ASCII. 876 try: 877 info[name].encode("ascii", "strict") 878 except UnicodeEncodeError: 879 pax_headers[hname] = info[name] 880 continue 881 882 if len(info[name]) > length: 883 pax_headers[hname] = info[name] 884 885 # Test number fields for values that exceed the field limit or values 886 # that like to be stored as float. 887 for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)): 888 if name in pax_headers: 889 # The pax header has priority. Avoid overflow. 890 info[name] = 0 891 continue 892 893 val = info[name] 894 if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): 895 pax_headers[name] = str(val) 896 info[name] = 0 897 898 # Create a pax extended header if necessary. 
899 if pax_headers: 900 buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding) 901 else: 902 buf = b"" 903 904 return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace") 905 906 @classmethod 907 def create_pax_global_header(cls, pax_headers): 908 """Return the object as a pax global header block sequence. 909 """ 910 return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8") 911 912 def _posix_split_name(self, name, encoding, errors): 913 """Split a name longer than 100 chars into a prefix 914 and a name part. 915 """ 916 components = name.split("/") 917 for i in range(1, len(components)): 918 prefix = "/".join(components[:i]) 919 name = "/".join(components[i:]) 920 if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \ 921 len(name.encode(encoding, errors)) <= LENGTH_NAME: 922 break 923 else: 924 raise ValueError("name is too long") 925 926 return prefix, name 927 928 @staticmethod 929 def _create_header(info, format, encoding, errors): 930 """Return a header block. info is a dictionary with file 931 information, format must be one of the *_FORMAT constants. 
932 """ 933 parts = [ 934 stn(info.get("name", ""), 100, encoding, errors), 935 itn(info.get("mode", 0) & 0o7777, 8, format), 936 itn(info.get("uid", 0), 8, format), 937 itn(info.get("gid", 0), 8, format), 938 itn(info.get("size", 0), 12, format), 939 itn(info.get("mtime", 0), 12, format), 940 b" ", # checksum field 941 info.get("type", REGTYPE), 942 stn(info.get("linkname", ""), 100, encoding, errors), 943 info.get("magic", POSIX_MAGIC), 944 stn(info.get("uname", ""), 32, encoding, errors), 945 stn(info.get("gname", ""), 32, encoding, errors), 946 itn(info.get("devmajor", 0), 8, format), 947 itn(info.get("devminor", 0), 8, format), 948 stn(info.get("prefix", ""), 155, encoding, errors) 949 ] 950 951 buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts)) 952 chksum = calc_chksums(buf[-BLOCKSIZE:])[0] 953 buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:] 954 return buf 955 956 @staticmethod 957 def _create_payload(payload): 958 """Return the string payload filled with zero bytes 959 up to the next 512 byte border. 960 """ 961 blocks, remainder = divmod(len(payload), BLOCKSIZE) 962 if remainder > 0: 963 payload += (BLOCKSIZE - remainder) * NUL 964 return payload 965 966 @classmethod 967 def _create_gnu_long_header(cls, name, type, encoding, errors): 968 """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence 969 for name. 970 """ 971 name = name.encode(encoding, errors) + NUL 972 973 info = {} 974 info["name"] = "././@LongLink" 975 info["type"] = type 976 info["size"] = len(name) 977 info["magic"] = GNU_MAGIC 978 979 # create extended header + name blocks. 980 return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \ 981 cls._create_payload(name) 982 983 @classmethod 984 def _create_pax_generic_header(cls, pax_headers, type, encoding): 985 """Return a POSIX.1-2008 extended or global header sequence 986 that contains a list of keyword, value pairs. The values 987 must be strings. 
988 """ 989 # Check if one of the fields contains surrogate characters and thereby 990 # forces hdrcharset=BINARY, see _proc_pax() for more information. 991 binary = False 992 for keyword, value in pax_headers.items(): 993 try: 994 value.encode("utf-8", "strict") 995 except UnicodeEncodeError: 996 binary = True 997 break 998 999 records = b"" 1000 if binary: 1001 # Put the hdrcharset field at the beginning of the header. 1002 records += b"21 hdrcharset=BINARY\n" 1003 1004 for keyword, value in pax_headers.items(): 1005 keyword = keyword.encode("utf-8") 1006 if binary: 1007 # Try to restore the original byte representation of `value'. 1008 # Needless to say, that the encoding must match the string. 1009 value = value.encode(encoding, "surrogateescape") 1010 else: 1011 value = value.encode("utf-8") 1012 1013 l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' 1014 n = p = 0 1015 while True: 1016 n = l + len(str(p)) 1017 if n == p: 1018 break 1019 p = n 1020 records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n" 1021 1022 # We use a hardcoded "././@PaxHeader" name like star does 1023 # instead of the one that POSIX recommends. 1024 info = {} 1025 info["name"] = "././@PaxHeader" 1026 info["type"] = type 1027 info["size"] = len(records) 1028 info["magic"] = POSIX_MAGIC 1029 1030 # Create pax header + record blocks. 1031 return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \ 1032 cls._create_payload(records) 1033 1034 @classmethod 1035 def frombuf(cls, buf, encoding, errors): 1036 """Construct a TarInfo object from a 512 byte bytes object. 
1037 """ 1038 if len(buf) == 0: 1039 raise EmptyHeaderError("empty header") 1040 if len(buf) != BLOCKSIZE: 1041 raise TruncatedHeaderError("truncated header") 1042 if buf.count(NUL) == BLOCKSIZE: 1043 raise EOFHeaderError("end of file header") 1044 1045 chksum = nti(buf[148:156]) 1046 if chksum not in calc_chksums(buf): 1047 raise InvalidHeaderError("bad checksum") 1048 1049 obj = cls() 1050 obj.name = nts(buf[0:100], encoding, errors) 1051 obj.mode = nti(buf[100:108]) 1052 obj.uid = nti(buf[108:116]) 1053 obj.gid = nti(buf[116:124]) 1054 obj.size = nti(buf[124:136]) 1055 obj.mtime = nti(buf[136:148]) 1056 obj.chksum = chksum 1057 obj.type = buf[156:157] 1058 obj.linkname = nts(buf[157:257], encoding, errors) 1059 obj.uname = nts(buf[265:297], encoding, errors) 1060 obj.gname = nts(buf[297:329], encoding, errors) 1061 obj.devmajor = nti(buf[329:337]) 1062 obj.devminor = nti(buf[337:345]) 1063 prefix = nts(buf[345:500], encoding, errors) 1064 1065 # Old V7 tar format represents a directory as a regular 1066 # file with a trailing slash. 1067 if obj.type == AREGTYPE and obj.name.endswith("/"): 1068 obj.type = DIRTYPE 1069 1070 # The old GNU sparse format occupies some of the unused 1071 # space in the buffer for up to 4 sparse structures. 1072 # Save them for later processing in _proc_sparse(). 1073 if obj.type == GNUTYPE_SPARSE: 1074 pos = 386 1075 structs = [] 1076 for i in range(4): 1077 try: 1078 offset = nti(buf[pos:pos + 12]) 1079 numbytes = nti(buf[pos + 12:pos + 24]) 1080 except ValueError: 1081 break 1082 structs.append((offset, numbytes)) 1083 pos += 24 1084 isextended = bool(buf[482]) 1085 origsize = nti(buf[483:495]) 1086 obj._sparse_structs = (structs, isextended, origsize) 1087 1088 # Remove redundant slashes from directories. 1089 if obj.isdir(): 1090 obj.name = obj.name.rstrip("/") 1091 1092 # Reconstruct a ustar longname. 
1093 if prefix and obj.type not in GNU_TYPES: 1094 obj.name = prefix + "/" + obj.name 1095 return obj 1096 1097 @classmethod 1098 def fromtarfile(cls, tarfile): 1099 """Return the next TarInfo object from TarFile object 1100 tarfile. 1101 """ 1102 buf = tarfile.fileobj.read(BLOCKSIZE) 1103 obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors) 1104 obj.offset = tarfile.fileobj.tell() - BLOCKSIZE 1105 return obj._proc_member(tarfile) 1106 1107 #-------------------------------------------------------------------------- 1108 # The following are methods that are called depending on the type of a 1109 # member. The entry point is _proc_member() which can be overridden in a 1110 # subclass to add custom _proc_*() methods. A _proc_*() method MUST 1111 # implement the following 1112 # operations: 1113 # 1. Set self.offset_data to the position where the data blocks begin, 1114 # if there is data that follows. 1115 # 2. Set tarfile.offset to the position where the next member's header will 1116 # begin. 1117 # 3. Return self or another valid TarInfo object. 1118 def _proc_member(self, tarfile): 1119 """Choose the right processing method depending on 1120 the type and call it. 1121 """ 1122 if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK): 1123 return self._proc_gnulong(tarfile) 1124 elif self.type == GNUTYPE_SPARSE: 1125 return self._proc_sparse(tarfile) 1126 elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE): 1127 return self._proc_pax(tarfile) 1128 else: 1129 return self._proc_builtin(tarfile) 1130 1131 def _proc_builtin(self, tarfile): 1132 """Process a builtin type or an unknown type which 1133 will be treated as a regular file. 1134 """ 1135 self.offset_data = tarfile.fileobj.tell() 1136 offset = self.offset_data 1137 if self.isreg() or self.type not in SUPPORTED_TYPES: 1138 # Skip the following data blocks. 1139 offset += self._block(self.size) 1140 tarfile.offset = offset 1141 1142 # Patch the TarInfo object with saved global 1143 # header information. 
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extension block carries up to 21 (offset, numbytes) pairs.
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags whether yet another extension block follows.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # self.size held the on-disk (compacted) size; expose the real size.
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            if length == 0:
                # A zero length would make pos stop advancing and loop forever.
                raise InvalidHeaderError("invalid header")
            # Slice the value out of the record: it runs from just past the
            # '=' up to (but not including) the trailing newline.
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.

        The (offset, numbytes) pairs are scraped directly from the raw
        record buffer because each keyword repeats.
        """
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.

        GNU.sparse.map is a single comma-separated list of alternating
        offset and numbytes values.
        """
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

        The sparse map is stored as newline-separated decimal numbers in
        the member's data area: first the entry count, then alternating
        offset/numbytes values.
        """
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                # Keep reading whole blocks until another number is complete.
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        # Unparsable numeric field: fall back to 0 rather
                        # than aborting the whole member.
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.

        Try `encoding' strictly first; on failure decode with the fallback
        encoding and error handler.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    def isreg(self):
        'Return True if the Tarinfo object is a regular file.'
        return self.type in REGULAR_TYPES

    def isfile(self):
        'Return True if the Tarinfo object is a regular file.'
        return self.isreg()

    def isdir(self):
        'Return True if it is a directory.'
        return self.type == DIRTYPE

    def issym(self):
        'Return True if it is a symbolic link.'
        return self.type == SYMTYPE

    def islnk(self):
        'Return True if it is a hard link.'
        return self.type == LNKTYPE

    def ischr(self):
        'Return True if it is a character device.'
        return self.type == CHRTYPE

    def isblk(self):
        'Return True if it is a block device.'
        return self.type == BLKTYPE

    def isfifo(self):
        'Return True if it is a FIFO.'
        return self.type == FIFOTYPE

    def issparse(self):
        'Return True if it is a sparse member.'
        return self.sparse is not None

    def isdev(self):
        'Return True if it is one of character device, block device or FIFO.'
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo

class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        self._mode = modes[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            # We opened the file ourselves, so we are responsible for closing.
            self._extfileobj = False
        else:
            if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # User-supplied pax_headers are only meaningful for PAX_FORMAT.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.copybufsize = copybufsize
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in ("a", "w", "x"):
                self._loaded = True

                if self.pax_headers:
                    # Emit a pax global header at the start of the archive.
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Initialization failed: close the file we opened (but never a
            # caller-supplied one) and mark the object unusable.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    #--------------------------------------------------------------------------
    # Below are the classmethods which act as alternate constructors to the
    # TarFile class. The open() method is the only one that is needed for
    # public use; it is the "super"-constructor and is able to select an
    # adequate "sub"-constructor for a particular compression using the mapping
    # from OPEN_METH.
    #
    # This concept allows one to subclass TarFile without losing the comfort of
    # the super-constructor. A sub-constructor is registered and made available
    # by adding it to the mapping in OPEN_METH.

    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression

           'x' or 'x:'  create a tarfile exclusively without compression, raise
                        an exception if the file is already created
           'x:gz'       create a gzip compressed tarfile, raise an exception
                        if the file is already created
           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                        if the file is already created
           'x:xz'       create an lzma compressed tarfile, raise an exception
                        if the file is already created

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            # Try the compressed openers first (sorted puts 'taropen' last,
            # since False sorts before True).
            def not_compressed(comptype):
                return cls.OPEN_METH[comptype] == 'taropen'
            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError):
                    # Rewind so the next opener sees the file from the start.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            # The stream is owned by us, so close() must close it.
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")

    @classmethod
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
        """Open uncompressed tar archive name for reading or writing.
        """
        if mode not in ("r", "a", "w", "x"):
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        return cls(name, mode, fileobj, **kwargs)

    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from gzip import GzipFile
        except ImportError:
            raise CompressionError("gzip module is not available")

        try:
            fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
        except OSError:
            if fileobj is not None and mode == 'r':
                raise ReadError("not a gzip file")
            raise

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a gzip file")
            raise
        except:
            fileobj.close()
            raise
        # The GzipFile wrapper is ours to close.
        t._extfileobj = False
        return t

    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from bz2 import BZ2File
        except ImportError:
            raise CompressionError("bz2 module is not available")

        fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a bzip2 file")
            raise
        except:
            fileobj.close()
            raise
        t._extfileobj = False
        return t

    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from lzma import LZMAFile, LZMAError
        except ImportError:
            raise CompressionError("lzma module is not available")

        fileobj = LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (LZMAError, EOFError):
            fileobj.close()
            if mode == 'r':
                raise ReadError("not an lzma file")
            raise
        except:
            fileobj.close()
            raise
        t._extfileobj = False
        return t

    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }

    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode in ("a", "w", "x"):
                self.fileobj.write(NUL * (BLOCKSIZE * 2))
                self.offset += (BLOCKSIZE * 2)
                # fill up the end with zero-blocks
                # (like option -b20 for tar does)
                blocks, remainder = divmod(self.offset, RECORDSIZE)
                if remainder > 0:
                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
1777 """ 1778 tarinfo = self._getmember(name) 1779 if tarinfo is None: 1780 raise KeyError("filename %r not found" % name) 1781 return tarinfo 1782 1783 def getmembers(self): 1784 """Return the members of the archive as a list of TarInfo objects. The 1785 list has the same order as the members in the archive. 1786 """ 1787 self._check() 1788 if not self._loaded: # if we want to obtain a list of 1789 self._load() # all members, we first have to 1790 # scan the whole archive. 1791 return self.members 1792 1793 def getnames(self): 1794 """Return the members of the archive as a list of their names. It has 1795 the same order as the list returned by getmembers(). 1796 """ 1797 return [tarinfo.name for tarinfo in self.getmembers()] 1798 1799 def gettarinfo(self, name=None, arcname=None, fileobj=None): 1800 """Create a TarInfo object from the result of os.stat or equivalent 1801 on an existing file. The file is either named by `name', or 1802 specified as a file object `fileobj' with a file descriptor. If 1803 given, `arcname' specifies an alternative name for the file in the 1804 archive, otherwise, the name is taken from the 'name' attribute of 1805 'fileobj', or the 'name' argument. The name should be a text 1806 string. 1807 """ 1808 self._check("awx") 1809 1810 # When fileobj is given, replace name by 1811 # fileobj's real name. 1812 if fileobj is not None: 1813 name = fileobj.name 1814 1815 # Building the name of the member in the archive. 1816 # Backward slashes are converted to forward slashes, 1817 # Absolute paths are turned to relative paths. 1818 if arcname is None: 1819 arcname = name 1820 drv, arcname = os.path.splitdrive(arcname) 1821 arcname = arcname.replace(os.sep, "/") 1822 arcname = arcname.lstrip("/") 1823 1824 # Now, fill the TarInfo object with 1825 # information specific for the file. 1826 tarinfo = self.tarinfo() 1827 tarinfo.tarfile = self # Not needed 1828 1829 # Use os.stat or os.lstat, depending on if symlinks shall be resolved. 
1830 if fileobj is None: 1831 if not self.dereference: 1832 statres = os.lstat(name) 1833 else: 1834 statres = os.stat(name) 1835 else: 1836 statres = os.fstat(fileobj.fileno()) 1837 linkname = "" 1838 1839 stmd = statres.st_mode 1840 if stat.S_ISREG(stmd): 1841 inode = (statres.st_ino, statres.st_dev) 1842 if not self.dereference and statres.st_nlink > 1 and \ 1843 inode in self.inodes and arcname != self.inodes[inode]: 1844 # Is it a hardlink to an already 1845 # archived file? 1846 type = LNKTYPE 1847 linkname = self.inodes[inode] 1848 else: 1849 # The inode is added only if its valid. 1850 # For win32 it is always 0. 1851 type = REGTYPE 1852 if inode[0]: 1853 self.inodes[inode] = arcname 1854 elif stat.S_ISDIR(stmd): 1855 type = DIRTYPE 1856 elif stat.S_ISFIFO(stmd): 1857 type = FIFOTYPE 1858 elif stat.S_ISLNK(stmd): 1859 type = SYMTYPE 1860 linkname = os.readlink(name) 1861 elif stat.S_ISCHR(stmd): 1862 type = CHRTYPE 1863 elif stat.S_ISBLK(stmd): 1864 type = BLKTYPE 1865 else: 1866 return None 1867 1868 # Fill the TarInfo object with all 1869 # information we can get. 1870 tarinfo.name = arcname 1871 tarinfo.mode = stmd 1872 tarinfo.uid = statres.st_uid 1873 tarinfo.gid = statres.st_gid 1874 if type == REGTYPE: 1875 tarinfo.size = statres.st_size 1876 else: 1877 tarinfo.size = 0 1878 tarinfo.mtime = statres.st_mtime 1879 tarinfo.type = type 1880 tarinfo.linkname = linkname 1881 if pwd: 1882 try: 1883 tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0] 1884 except KeyError: 1885 pass 1886 if grp: 1887 try: 1888 tarinfo.gname = grp.getgrgid(tarinfo.gid)[0] 1889 except KeyError: 1890 pass 1891 1892 if type in (CHRTYPE, BLKTYPE): 1893 if hasattr(os, "major") and hasattr(os, "minor"): 1894 tarinfo.devmajor = os.major(statres.st_rdev) 1895 tarinfo.devminor = os.minor(statres.st_rdev) 1896 return tarinfo 1897 1898 def list(self, verbose=True, *, members=None): 1899 """Print a table of contents to sys.stdout. 
If `verbose' is False, only 1900 the names of the members are printed. If it is True, an `ls -l'-like 1901 output is produced. `members' is optional and must be a subset of the 1902 list returned by getmembers(). 1903 """ 1904 self._check() 1905 1906 if members is None: 1907 members = self 1908 for tarinfo in members: 1909 if verbose: 1910 _safe_print(stat.filemode(tarinfo.mode)) 1911 _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid, 1912 tarinfo.gname or tarinfo.gid)) 1913 if tarinfo.ischr() or tarinfo.isblk(): 1914 _safe_print("%10s" % 1915 ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor))) 1916 else: 1917 _safe_print("%10d" % tarinfo.size) 1918 _safe_print("%d-%02d-%02d %02d:%02d:%02d" \ 1919 % time.localtime(tarinfo.mtime)[:6]) 1920 1921 _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else "")) 1922 1923 if verbose: 1924 if tarinfo.issym(): 1925 _safe_print("-> " + tarinfo.linkname) 1926 if tarinfo.islnk(): 1927 _safe_print("link to " + tarinfo.linkname) 1928 print() 1929 1930 def add(self, name, arcname=None, recursive=True, *, filter=None): 1931 """Add the file `name' to the archive. `name' may be any type of file 1932 (directory, fifo, symbolic link, etc.). If given, `arcname' 1933 specifies an alternative name for the file in the archive. 1934 Directories are added recursively by default. This can be avoided by 1935 setting `recursive' to False. `filter' is a function 1936 that expects a TarInfo object argument and returns the changed 1937 TarInfo object, if it returns None the TarInfo object will be 1938 excluded from the archive. 1939 """ 1940 self._check("awx") 1941 1942 if arcname is None: 1943 arcname = name 1944 1945 # Skip if somebody tries to archive the archive... 1946 if self.name is not None and os.path.abspath(name) == self.name: 1947 self._dbg(2, "tarfile: Skipped %r" % name) 1948 return 1949 1950 self._dbg(1, name) 1951 1952 # Create a TarInfo object from the file. 
1953 tarinfo = self.gettarinfo(name, arcname) 1954 1955 if tarinfo is None: 1956 self._dbg(1, "tarfile: Unsupported type %r" % name) 1957 return 1958 1959 # Change or exclude the TarInfo object. 1960 if filter is not None: 1961 tarinfo = filter(tarinfo) 1962 if tarinfo is None: 1963 self._dbg(2, "tarfile: Excluded %r" % name) 1964 return 1965 1966 # Append the tar header and data to the archive. 1967 if tarinfo.isreg(): 1968 with bltn_open(name, "rb") as f: 1969 self.addfile(tarinfo, f) 1970 1971 elif tarinfo.isdir(): 1972 self.addfile(tarinfo) 1973 if recursive: 1974 for f in sorted(os.listdir(name)): 1975 self.add(os.path.join(name, f), os.path.join(arcname, f), 1976 recursive, filter=filter) 1977 1978 else: 1979 self.addfile(tarinfo) 1980 1981 def addfile(self, tarinfo, fileobj=None): 1982 """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is 1983 given, it should be a binary file, and tarinfo.size bytes are read 1984 from it and added to the archive. You can create TarInfo objects 1985 directly, or by using gettarinfo(). 1986 """ 1987 self._check("awx") 1988 1989 tarinfo = copy.copy(tarinfo) 1990 1991 buf = tarinfo.tobuf(self.format, self.encoding, self.errors) 1992 self.fileobj.write(buf) 1993 self.offset += len(buf) 1994 bufsize=self.copybufsize 1995 # If there's data to follow, append it. 1996 if fileobj is not None: 1997 copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize) 1998 blocks, remainder = divmod(tarinfo.size, BLOCKSIZE) 1999 if remainder > 0: 2000 self.fileobj.write(NUL * (BLOCKSIZE - remainder)) 2001 blocks += 1 2002 self.offset += blocks * BLOCKSIZE 2003 2004 self.members.append(tarinfo) 2005 2006 def extractall(self, path=".", members=None, *, numeric_owner=False): 2007 """Extract all members from the archive to the current working 2008 directory and set owner, modification time and permissions on 2009 directories afterwards. `path' specifies a different directory 2010 to extract to. 
           `members' is optional and must be a subset of the
           list returned by getmembers(). If `numeric_owner` is True, only
           the numbers for user/group names are used and not the names.
        """
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
                # Extract directories with a safe mode.
                directories.append(tarinfo)
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0o700
            # Do not set_attrs directories, as we will do that further down
            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
                         numeric_owner=numeric_owner)

        # Reverse sort directories.
        # Deepest paths sort last by name, so the reversed order fixes up
        # children before their parents.
        directories.sort(key=lambda a: a.name)
        directories.reverse()

        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
            try:
                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)

    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
           is True, only the numbers for user/group names are used and not
           the names.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs,
                                 numeric_owner=numeric_owner)
        except OSError as e:
            # With errorlevel 0, OS errors are demoted to debug messages.
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)

    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file or a
           link, an io.BufferedReader object is returned. Otherwise, None is
           returned.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
            # Members with unknown types are treated as regular files.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None

    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
                        numeric_owner=False):
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
            # Create directories that are not part of the archive with
            # default permissions.
            os.makedirs(upperdirs)

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        # Dispatch to the type-specific make*() method; all of them can be
        # overridden in a subclass.
        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

        if set_attrs:
            self.chown(tarinfo, targetpath, numeric_owner)
            if not tarinfo.issym():
                # chmod/utime on the symlink itself is not portable; skip it.
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)

    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            # Use a safe mode for the directory, the real mode is set
            # later in _extract_member().
2170 os.mkdir(targetpath, 0o700) 2171 except FileExistsError: 2172 pass 2173 2174 def makefile(self, tarinfo, targetpath): 2175 """Make a file called targetpath. 2176 """ 2177 source = self.fileobj 2178 source.seek(tarinfo.offset_data) 2179 bufsize = self.copybufsize 2180 with bltn_open(targetpath, "wb") as target: 2181 if tarinfo.sparse is not None: 2182 for offset, size in tarinfo.sparse: 2183 target.seek(offset) 2184 copyfileobj(source, target, size, ReadError, bufsize) 2185 target.seek(tarinfo.size) 2186 target.truncate() 2187 else: 2188 copyfileobj(source, target, tarinfo.size, ReadError, bufsize) 2189 2190 def makeunknown(self, tarinfo, targetpath): 2191 """Make a file from a TarInfo object with an unknown type 2192 at targetpath. 2193 """ 2194 self.makefile(tarinfo, targetpath) 2195 self._dbg(1, "tarfile: Unknown file type %r, " \ 2196 "extracted as regular file." % tarinfo.type) 2197 2198 def makefifo(self, tarinfo, targetpath): 2199 """Make a fifo called targetpath. 2200 """ 2201 if hasattr(os, "mkfifo"): 2202 os.mkfifo(targetpath) 2203 else: 2204 raise ExtractError("fifo not supported by system") 2205 2206 def makedev(self, tarinfo, targetpath): 2207 """Make a character or block device called targetpath. 2208 """ 2209 if not hasattr(os, "mknod") or not hasattr(os, "makedev"): 2210 raise ExtractError("special devices not supported by system") 2211 2212 mode = tarinfo.mode 2213 if tarinfo.isblk(): 2214 mode |= stat.S_IFBLK 2215 else: 2216 mode |= stat.S_IFCHR 2217 2218 os.mknod(targetpath, mode, 2219 os.makedev(tarinfo.devmajor, tarinfo.devminor)) 2220 2221 def makelink(self, tarinfo, targetpath): 2222 """Make a (symbolic) link called targetpath. If it cannot be created 2223 (platform limitation), we try to make a copy of the referenced file 2224 instead of a link. 2225 """ 2226 try: 2227 # For systems that support symbolic and hard links. 2228 if tarinfo.issym(): 2229 os.symlink(tarinfo.linkname, targetpath) 2230 else: 2231 # See extract(). 
2232 if os.path.exists(tarinfo._link_target): 2233 os.link(tarinfo._link_target, targetpath) 2234 else: 2235 self._extract_member(self._find_link_target(tarinfo), 2236 targetpath) 2237 except symlink_exception: 2238 try: 2239 self._extract_member(self._find_link_target(tarinfo), 2240 targetpath) 2241 except KeyError: 2242 raise ExtractError("unable to resolve link inside archive") 2243 2244 def chown(self, tarinfo, targetpath, numeric_owner): 2245 """Set owner of targetpath according to tarinfo. If numeric_owner 2246 is True, use .gid/.uid instead of .gname/.uname. If numeric_owner 2247 is False, fall back to .gid/.uid when the search based on name 2248 fails. 2249 """ 2250 if hasattr(os, "geteuid") and os.geteuid() == 0: 2251 # We have to be root to do so. 2252 g = tarinfo.gid 2253 u = tarinfo.uid 2254 if not numeric_owner: 2255 try: 2256 if grp: 2257 g = grp.getgrnam(tarinfo.gname)[2] 2258 except KeyError: 2259 pass 2260 try: 2261 if pwd: 2262 u = pwd.getpwnam(tarinfo.uname)[2] 2263 except KeyError: 2264 pass 2265 try: 2266 if tarinfo.issym() and hasattr(os, "lchown"): 2267 os.lchown(targetpath, u, g) 2268 else: 2269 os.chown(targetpath, u, g) 2270 except OSError: 2271 raise ExtractError("could not change owner") 2272 2273 def chmod(self, tarinfo, targetpath): 2274 """Set file permissions of targetpath according to tarinfo. 2275 """ 2276 try: 2277 os.chmod(targetpath, tarinfo.mode) 2278 except OSError: 2279 raise ExtractError("could not change mode") 2280 2281 def utime(self, tarinfo, targetpath): 2282 """Set modification time of targetpath according to tarinfo. 
2283 """ 2284 if not hasattr(os, 'utime'): 2285 return 2286 try: 2287 os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime)) 2288 except OSError: 2289 raise ExtractError("could not change modification time") 2290 2291 #-------------------------------------------------------------------------- 2292 def next(self): 2293 """Return the next member of the archive as a TarInfo object, when 2294 TarFile is opened for reading. Return None if there is no more 2295 available. 2296 """ 2297 self._check("ra") 2298 if self.firstmember is not None: 2299 m = self.firstmember 2300 self.firstmember = None 2301 return m 2302 2303 # Advance the file pointer. 2304 if self.offset != self.fileobj.tell(): 2305 self.fileobj.seek(self.offset - 1) 2306 if not self.fileobj.read(1): 2307 raise ReadError("unexpected end of data") 2308 2309 # Read the next block. 2310 tarinfo = None 2311 while True: 2312 try: 2313 tarinfo = self.tarinfo.fromtarfile(self) 2314 except EOFHeaderError as e: 2315 if self.ignore_zeros: 2316 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2317 self.offset += BLOCKSIZE 2318 continue 2319 except InvalidHeaderError as e: 2320 if self.ignore_zeros: 2321 self._dbg(2, "0x%X: %s" % (self.offset, e)) 2322 self.offset += BLOCKSIZE 2323 continue 2324 elif self.offset == 0: 2325 raise ReadError(str(e)) 2326 except EmptyHeaderError: 2327 if self.offset == 0: 2328 raise ReadError("empty file") 2329 except TruncatedHeaderError as e: 2330 if self.offset == 0: 2331 raise ReadError(str(e)) 2332 except SubsequentHeaderError as e: 2333 raise ReadError(str(e)) 2334 break 2335 2336 if tarinfo is not None: 2337 self.members.append(tarinfo) 2338 else: 2339 self._loaded = True 2340 2341 return tarinfo 2342 2343 #-------------------------------------------------------------------------- 2344 # Little helper methods: 2345 2346 def _getmember(self, name, tarinfo=None, normalize=False): 2347 """Find an archive member by name from bottom to top. 
2348 If tarinfo is given, it is used as the starting point. 2349 """ 2350 # Ensure that all members have been loaded. 2351 members = self.getmembers() 2352 2353 # Limit the member search list up to tarinfo. 2354 if tarinfo is not None: 2355 members = members[:members.index(tarinfo)] 2356 2357 if normalize: 2358 name = os.path.normpath(name) 2359 2360 for member in reversed(members): 2361 if normalize: 2362 member_name = os.path.normpath(member.name) 2363 else: 2364 member_name = member.name 2365 2366 if name == member_name: 2367 return member 2368 2369 def _load(self): 2370 """Read through the entire archive file and look for readable 2371 members. 2372 """ 2373 while True: 2374 tarinfo = self.next() 2375 if tarinfo is None: 2376 break 2377 self._loaded = True 2378 2379 def _check(self, mode=None): 2380 """Check if TarFile is still open, and if the operation's mode 2381 corresponds to TarFile's mode. 2382 """ 2383 if self.closed: 2384 raise OSError("%s is closed" % self.__class__.__name__) 2385 if mode is not None and self.mode not in mode: 2386 raise OSError("bad operation for mode %r" % self.mode) 2387 2388 def _find_link_target(self, tarinfo): 2389 """Find the target member of a symlink or hardlink member in the 2390 archive. 2391 """ 2392 if tarinfo.issym(): 2393 # Always search the entire archive. 2394 linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname))) 2395 limit = None 2396 else: 2397 # Search the archive before the link, because a hard link is 2398 # just a reference to an already archived file. 2399 linkname = tarinfo.linkname 2400 limit = tarinfo 2401 2402 member = self._getmember(linkname, tarinfo=limit, normalize=True) 2403 if member is None: 2404 raise KeyError("linkname %r not found" % linkname) 2405 return member 2406 2407 def __iter__(self): 2408 """Provide an iterator object. 2409 """ 2410 if self._loaded: 2411 yield from self.members 2412 return 2413 2414 # Yield items using TarFile's next() method. 
2415 # When all members have been read, set TarFile as _loaded. 2416 index = 0 2417 # Fix for SF #1100429: Under rare circumstances it can 2418 # happen that getmembers() is called during iteration, 2419 # which will have already exhausted the next() method. 2420 if self.firstmember is not None: 2421 tarinfo = self.next() 2422 index += 1 2423 yield tarinfo 2424 2425 while True: 2426 if index < len(self.members): 2427 tarinfo = self.members[index] 2428 elif not self._loaded: 2429 tarinfo = self.next() 2430 if not tarinfo: 2431 self._loaded = True 2432 return 2433 else: 2434 return 2435 index += 1 2436 yield tarinfo 2437 2438 def _dbg(self, level, msg): 2439 """Write debugging output to sys.stderr. 2440 """ 2441 if level <= self.debug: 2442 print(msg, file=sys.stderr) 2443 2444 def __enter__(self): 2445 self._check() 2446 return self 2447 2448 def __exit__(self, type, value, traceback): 2449 if type is None: 2450 self.close() 2451 else: 2452 # An exception occurred. We must not call close() because 2453 # it would try to write end-of-archive blocks and padding. 2454 if not self._extfileobj: 2455 self.fileobj.close() 2456 self.closed = True 2457 2458#-------------------- 2459# exported functions 2460#-------------------- 2461def is_tarfile(name): 2462 """Return True if name points to a tar archive that we 2463 are able to handle, else return False. 2464 """ 2465 try: 2466 t = open(name) 2467 t.close() 2468 return True 2469 except TarError: 2470 return False 2471 2472open = TarFile.open 2473 2474 2475def main(): 2476 import argparse 2477 2478 description = 'A simple command-line interface for tarfile module.' 
2479 parser = argparse.ArgumentParser(description=description) 2480 parser.add_argument('-v', '--verbose', action='store_true', default=False, 2481 help='Verbose output') 2482 group = parser.add_mutually_exclusive_group(required=True) 2483 group.add_argument('-l', '--list', metavar='<tarfile>', 2484 help='Show listing of a tarfile') 2485 group.add_argument('-e', '--extract', nargs='+', 2486 metavar=('<tarfile>', '<output_dir>'), 2487 help='Extract tarfile into target dir') 2488 group.add_argument('-c', '--create', nargs='+', 2489 metavar=('<name>', '<file>'), 2490 help='Create tarfile from sources') 2491 group.add_argument('-t', '--test', metavar='<tarfile>', 2492 help='Test if a tarfile is valid') 2493 args = parser.parse_args() 2494 2495 if args.test is not None: 2496 src = args.test 2497 if is_tarfile(src): 2498 with open(src, 'r') as tar: 2499 tar.getmembers() 2500 print(tar.getmembers(), file=sys.stderr) 2501 if args.verbose: 2502 print('{!r} is a tar archive.'.format(src)) 2503 else: 2504 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2505 2506 elif args.list is not None: 2507 src = args.list 2508 if is_tarfile(src): 2509 with TarFile.open(src, 'r:*') as tf: 2510 tf.list(verbose=args.verbose) 2511 else: 2512 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2513 2514 elif args.extract is not None: 2515 if len(args.extract) == 1: 2516 src = args.extract[0] 2517 curdir = os.curdir 2518 elif len(args.extract) == 2: 2519 src, curdir = args.extract 2520 else: 2521 parser.exit(1, parser.format_help()) 2522 2523 if is_tarfile(src): 2524 with TarFile.open(src, 'r:*') as tf: 2525 tf.extractall(path=curdir) 2526 if args.verbose: 2527 if curdir == '.': 2528 msg = '{!r} file is extracted.'.format(src) 2529 else: 2530 msg = ('{!r} file is extracted ' 2531 'into {!r} directory.').format(src, curdir) 2532 print(msg) 2533 else: 2534 parser.exit(1, '{!r} is not a tar archive.\n'.format(src)) 2535 2536 elif args.create is not None: 2537 tar_name = 
args.create.pop(0) 2538 _, ext = os.path.splitext(tar_name) 2539 compressions = { 2540 # gz 2541 '.gz': 'gz', 2542 '.tgz': 'gz', 2543 # xz 2544 '.xz': 'xz', 2545 '.txz': 'xz', 2546 # bz2 2547 '.bz2': 'bz2', 2548 '.tbz': 'bz2', 2549 '.tbz2': 'bz2', 2550 '.tb2': 'bz2', 2551 } 2552 tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w' 2553 tar_files = args.create 2554 2555 with TarFile.open(tar_name, tar_mode) as tf: 2556 for file_name in tar_files: 2557 tf.add(file_name) 2558 2559 if args.verbose: 2560 print('{!r} file created.'.format(tar_name)) 2561 2562if __name__ == '__main__': 2563 main() 2564