#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
# All rights reserved.
#
# Permission  is  hereby granted,  free  of charge,  to  any person
# obtaining a  copy of  this software  and associated documentation
# files  (the  "Software"),  to   deal  in  the  Software   without
# restriction,  including  without limitation  the  rights to  use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies  of  the  Software,  and to  permit  persons  to  whom the
# Software  is  furnished  to  do  so,  subject  to  the  following
# conditions:
#
# The above copyright  notice and this  permission notice shall  be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
# EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
# OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
# NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
# HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
# WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
"""Read from and write to tar format archives.
"""

version     = "0.9.0"
__author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."

#---------
# Imports
#---------
from builtins import open as bltn_open
import sys
import os
import io
import shutil
import stat
import time
import struct
import copy
import re

try:
    import pwd
except ImportError:
    pwd = None
try:
    import grp
except ImportError:
    grp = None

# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass

# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError",
           "CompressionError", "StreamError", "ExtractError", "HeaderError",
           "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT",
           "DEFAULT_FORMAT", "open"]

74#---------------------------------------------------------
75# tar constants
76#---------------------------------------------------------
77NUL = b"\0"                     # the null character
78BLOCKSIZE = 512                 # length of processing blocks
79RECORDSIZE = BLOCKSIZE * 20     # length of records
80GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
81POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string
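# Both constants cover the 8 bytes the ustar header reserves for the "magic"
# (6 bytes) and "version" (2 bytes) fields: POSIX writes "ustar\0" plus "00",
# while GNU tar writes "ustar " plus " \0".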
82
83LENGTH_NAME = 100               # maximum length of a filename
84LENGTH_LINK = 100               # maximum length of a linkname
85LENGTH_PREFIX = 155             # maximum length of the prefix field
86
87REGTYPE = b"0"                  # regular file
88AREGTYPE = b"\0"                # regular file
89LNKTYPE = b"1"                  # link (inside tarfile)
90SYMTYPE = b"2"                  # symbolic link
91CHRTYPE = b"3"                  # character special device
92BLKTYPE = b"4"                  # block special device
93DIRTYPE = b"5"                  # directory
94FIFOTYPE = b"6"                 # fifo special device
95CONTTYPE = b"7"                 # contiguous file
96
97GNUTYPE_LONGNAME = b"L"         # GNU tar longname
98GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
99GNUTYPE_SPARSE = b"S"           # GNU tar sparse file
100
101XHDTYPE = b"x"                  # POSIX.1-2001 extended header
102XGLTYPE = b"g"                  # POSIX.1-2001 global header
103SOLARIS_XHDTYPE = b"X"          # Solaris extended header
104
105USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
106GNU_FORMAT = 1                  # GNU tar format
107PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
108DEFAULT_FORMAT = PAX_FORMAT
109
110#---------------------------------------------------------
111# tarfile constants
112#---------------------------------------------------------
113# File types that tarfile supports:
114SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
115                   SYMTYPE, DIRTYPE, FIFOTYPE,
116                   CONTTYPE, CHRTYPE, BLKTYPE,
117                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
118                   GNUTYPE_SPARSE)
119
120# File types that will be treated as a regular file.
121REGULAR_TYPES = (REGTYPE, AREGTYPE,
122                 CONTTYPE, GNUTYPE_SPARSE)
123
124# File types that are part of the GNU tar format.
125GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
126             GNUTYPE_SPARSE)
127
128# Fields from a pax header that override a TarInfo attribute.
129PAX_FIELDS = ("path", "linkpath", "size", "mtime",
130              "uid", "gid", "uname", "gname")
131
132# Fields from a pax header that are affected by hdrcharset.
133PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}
134
135# Fields in a pax header that are numbers, all other fields
136# are treated as strings.
137PAX_NUMBER_FIELDS = {
138    "atime": float,
139    "ctime": float,
140    "mtime": float,
141    "uid": int,
142    "gid": int,
143    "size": int
144}
145
146#---------------------------------------------------------
147# initialization
148#---------------------------------------------------------
149if os.name == "nt":
150    ENCODING = "utf-8"
151else:
152    ENCODING = sys.getfilesystemencoding()
153
154#---------------------------------------------------------
155# Some useful functions
156#---------------------------------------------------------
157
158def stn(s, length, encoding, errors):
159    """Convert a string to a null-terminated bytes object.
160    """
161    s = s.encode(encoding, errors)
162    return s[:length] + (length - len(s)) * NUL
163
164def nts(s, encoding, errors):
165    """Convert a null-terminated bytes object to a string.
166    """
167    p = s.find(b"\0")
168    if p != -1:
169        s = s[:p]
170    return s.decode(encoding, errors)
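
# A round trip through these helpers, for illustration:
#     stn("foo", 8, "ascii", "strict")             -> b"foo\x00\x00\x00\x00\x00"
#     nts(b"foo\x00\x00\x00\x00\x00", "ascii", "strict") -> "foo"
# Strings longer than the field are silently truncated by stn().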
171
172def nti(s):
173    """Convert a number field to a python number.
174    """
175    # There are two possible encodings for a number field, see
176    # itn() below.
177    if s[0] in (0o200, 0o377):
178        n = 0
179        for i in range(len(s) - 1):
180            n <<= 8
181            n += s[i + 1]
182        if s[0] == 0o377:
183            n = -(256 ** (len(s) - 1) - n)
184    else:
185        try:
186            s = nts(s, "ascii", "strict")
187            n = int(s.strip() or "0", 8)
188        except ValueError:
189            raise InvalidHeaderError("invalid header")
190    return n
191
192def itn(n, digits=8, format=DEFAULT_FORMAT):
193    """Convert a python number to a number field.
194    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null byte; this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding; the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
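    # For illustration (values follow from the rules above):
    #     itn(1000)                  -> b"0001750\x00"   (7 octal digits + NUL)
    #     itn(-1, 8, GNU_FORMAT)     -> b"\xff" * 8      (base-256, negative)
    #     nti(b"\xff" * 8)           -> -1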
203    n = int(n)
204    if 0 <= n < 8 ** (digits - 1):
205        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
206    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
207        if n >= 0:
208            s = bytearray([0o200])
209        else:
210            s = bytearray([0o377])
211            n = 256 ** digits + n
212
213        for i in range(digits - 1):
214            s.insert(1, n & 0o377)
215            n >>= 8
216    else:
217        raise ValueError("overflow in number field")
218
219    return s
220
221def calc_chksums(buf):
222    """Calculate the checksum for a member's header by summing up all
223       characters except for the chksum field which is treated as if
224       it was filled with spaces. According to the GNU tar sources,
225       some tars (Sun and NeXT) calculate chksum with signed char,
226       which will be different if there are chars in the buffer with
227       the high bit set. So we calculate two checksums, unsigned and
228       signed.
229    """
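    # The struct format skips the 8-byte chksum field itself: 148 bytes come
    # before it in the header, 8 padding bytes ("8x") cover the field, and the
    # remaining 356 bytes complete the 512-byte block. The added 256 accounts
    # for the skipped field being treated as 8 spaces (8 * 0x20).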
230    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
231    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
232    return unsigned_chksum, signed_chksum
233
234def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
235    """Copy length bytes from fileobj src to fileobj dst.
236       If length is None, copy the entire content.
237    """
238    bufsize = bufsize or 16 * 1024
239    if length == 0:
240        return
241    if length is None:
242        shutil.copyfileobj(src, dst, bufsize)
243        return
244
245    blocks, remainder = divmod(length, bufsize)
246    for b in range(blocks):
247        buf = src.read(bufsize)
248        if len(buf) < bufsize:
249            raise exception("unexpected end of data")
250        dst.write(buf)
251
252    if remainder != 0:
253        buf = src.read(remainder)
254        if len(buf) < remainder:
255            raise exception("unexpected end of data")
256        dst.write(buf)
257    return
258
259def _safe_print(s):
260    encoding = getattr(sys.stdout, 'encoding', None)
261    if encoding is not None:
262        s = s.encode(encoding, 'backslashreplace').decode(encoding)
263    print(s, end=' ')
264
265
266class TarError(Exception):
267    """Base exception."""
268    pass
269class ExtractError(TarError):
270    """General exception for extract errors."""
271    pass
272class ReadError(TarError):
273    """Exception for unreadable tar archives."""
274    pass
275class CompressionError(TarError):
276    """Exception for unavailable compression methods."""
277    pass
278class StreamError(TarError):
279    """Exception for unsupported operations on stream-like TarFiles."""
280    pass
281class HeaderError(TarError):
282    """Base exception for header errors."""
283    pass
284class EmptyHeaderError(HeaderError):
285    """Exception for empty headers."""
286    pass
287class TruncatedHeaderError(HeaderError):
288    """Exception for truncated headers."""
289    pass
290class EOFHeaderError(HeaderError):
291    """Exception for end of file headers."""
292    pass
293class InvalidHeaderError(HeaderError):
294    """Exception for invalid headers."""
295    pass
296class SubsequentHeaderError(HeaderError):
297    """Exception for missing and invalid extended headers."""
298    pass
299
300#---------------------------
301# internal stream interface
302#---------------------------
303class _LowLevelFile:
304    """Low-level file object. Supports reading and writing.
305       It is used instead of a regular file object for streaming
306       access.
307    """
308
309    def __init__(self, name, mode):
310        mode = {
311            "r": os.O_RDONLY,
312            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
313        }[mode]
314        if hasattr(os, "O_BINARY"):
315            mode |= os.O_BINARY
316        self.fd = os.open(name, mode, 0o666)
317
318    def close(self):
319        os.close(self.fd)
320
321    def read(self, size):
322        return os.read(self.fd, size)
323
324    def write(self, s):
325        os.write(self.fd, s)
326
327class _Stream:
328    """Class that serves as an adapter between TarFile and
329       a stream-like object.  The stream-like object only
330       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip, bzip2 or xz compression is possible.
       A stream-like object could be for example: sys.stdin.buffer,
       sys.stdout.buffer, a socket, a tape device etc.
334
335       _Stream is intended to be used only internally.
336    """
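
    # For example, the streaming modes of the public interface, such as
    # tarfile.open(fileobj=sys.stdin.buffer, mode="r|*") or
    # tarfile.open("archive.tar", mode="w|gz"), read and write through a
    # _Stream instance.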
337
338    def __init__(self, name, mode, comptype, fileobj, bufsize):
339        """Construct a _Stream object.
340        """
341        self._extfileobj = True
342        if fileobj is None:
343            fileobj = _LowLevelFile(name, mode)
344            self._extfileobj = False
345
346        if comptype == '*':
347            # Enable transparent compression detection for the
348            # stream interface
349            fileobj = _StreamProxy(fileobj)
350            comptype = fileobj.getcomptype()
351
352        self.name     = name or ""
353        self.mode     = mode
354        self.comptype = comptype
355        self.fileobj  = fileobj
356        self.bufsize  = bufsize
357        self.buf      = b""
358        self.pos      = 0
359        self.closed   = False
360
361        try:
362            if comptype == "gz":
363                try:
364                    import zlib
365                except ImportError:
366                    raise CompressionError("zlib module is not available")
367                self.zlib = zlib
368                self.crc = zlib.crc32(b"")
369                if mode == "r":
370                    self._init_read_gz()
371                    self.exception = zlib.error
372                else:
373                    self._init_write_gz()
374
375            elif comptype == "bz2":
376                try:
377                    import bz2
378                except ImportError:
379                    raise CompressionError("bz2 module is not available")
380                if mode == "r":
381                    self.dbuf = b""
382                    self.cmp = bz2.BZ2Decompressor()
383                    self.exception = OSError
384                else:
385                    self.cmp = bz2.BZ2Compressor()
386
387            elif comptype == "xz":
388                try:
389                    import lzma
390                except ImportError:
391                    raise CompressionError("lzma module is not available")
392                if mode == "r":
393                    self.dbuf = b""
394                    self.cmp = lzma.LZMADecompressor()
395                    self.exception = lzma.LZMAError
396                else:
397                    self.cmp = lzma.LZMACompressor()
398
399            elif comptype != "tar":
400                raise CompressionError("unknown compression type %r" % comptype)
401
402        except:
403            if not self._extfileobj:
404                self.fileobj.close()
405            self.closed = True
406            raise
407
408    def __del__(self):
409        if hasattr(self, "closed") and not self.closed:
410            self.close()
411
412    def _init_write_gz(self):
413        """Initialize for writing with gzip compression.
414        """
415        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
416                                            -self.zlib.MAX_WBITS,
417                                            self.zlib.DEF_MEM_LEVEL,
418                                            0)
419        timestamp = struct.pack("<L", int(time.time()))
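        # Basic gzip header per RFC 1952: magic (\037\213), compression
        # method 8 (deflate), flags 0x08 (FNAME follows), 4-byte mtime,
        # extra flags 2 (maximum compression), OS byte 0o377 (unknown).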
420        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
421        if self.name.endswith(".gz"):
422            self.name = self.name[:-3]
423        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
424        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)
425
426    def write(self, s):
427        """Write string s to the stream.
428        """
429        if self.comptype == "gz":
430            self.crc = self.zlib.crc32(s, self.crc)
431        self.pos += len(s)
432        if self.comptype != "tar":
433            s = self.cmp.compress(s)
434        self.__write(s)
435
436    def __write(self, s):
437        """Write string s to the stream if a whole new block
438           is ready to be written.
439        """
440        self.buf += s
441        while len(self.buf) > self.bufsize:
442            self.fileobj.write(self.buf[:self.bufsize])
443            self.buf = self.buf[self.bufsize:]
444
445    def close(self):
446        """Close the _Stream object. No operation should be
447           done on it afterwards.
448        """
449        if self.closed:
450            return
451
452        self.closed = True
453        try:
454            if self.mode == "w" and self.comptype != "tar":
455                self.buf += self.cmp.flush()
456
457            if self.mode == "w" and self.buf:
458                self.fileobj.write(self.buf)
459                self.buf = b""
460                if self.comptype == "gz":
461                    self.fileobj.write(struct.pack("<L", self.crc))
462                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
463        finally:
464            if not self._extfileobj:
465                self.fileobj.close()
466
467    def _init_read_gz(self):
468        """Initialize for reading a gzip compressed fileobj.
469        """
470        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
471        self.dbuf = b""
472
473        # taken from gzip.GzipFile with some alterations
474        if self.__read(2) != b"\037\213":
475            raise ReadError("not a gzip file")
476        if self.__read(1) != b"\010":
477            raise CompressionError("unsupported compression method")
478
479        flag = ord(self.__read(1))
480        self.__read(6)
481
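        # The FLG bits handled below are FEXTRA (4), FNAME (8), FCOMMENT (16)
        # and FHCRC (2); the corresponding optional fields are read and
        # discarded.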
482        if flag & 4:
483            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
484            self.read(xlen)
485        if flag & 8:
486            while True:
487                s = self.__read(1)
488                if not s or s == NUL:
489                    break
490        if flag & 16:
491            while True:
492                s = self.__read(1)
493                if not s or s == NUL:
494                    break
495        if flag & 2:
496            self.__read(2)
497
498    def tell(self):
499        """Return the stream's file pointer position.
500        """
501        return self.pos
502
503    def seek(self, pos=0):
504        """Set the stream's file pointer to pos. Negative seeking
505           is forbidden.
506        """
507        if pos - self.pos >= 0:
508            blocks, remainder = divmod(pos - self.pos, self.bufsize)
509            for i in range(blocks):
510                self.read(self.bufsize)
511            self.read(remainder)
512        else:
513            raise StreamError("seeking backwards is not allowed")
514        return self.pos
515
516    def read(self, size):
517        """Return the next size number of bytes from the stream."""
518        assert size is not None
519        buf = self._read(size)
520        self.pos += len(buf)
521        return buf
522
523    def _read(self, size):
524        """Return size bytes from the stream.
525        """
526        if self.comptype == "tar":
527            return self.__read(size)
528
529        c = len(self.dbuf)
530        t = [self.dbuf]
531        while c < size:
532            # Skip underlying buffer to avoid unaligned double buffering.
533            if self.buf:
534                buf = self.buf
535                self.buf = b""
536            else:
537                buf = self.fileobj.read(self.bufsize)
538                if not buf:
539                    break
540            try:
541                buf = self.cmp.decompress(buf)
542            except self.exception:
543                raise ReadError("invalid compressed data")
544            t.append(buf)
545            c += len(buf)
546        t = b"".join(t)
547        self.dbuf = t[size:]
548        return t[:size]
549
550    def __read(self, size):
551        """Return size bytes from stream. If internal buffer is empty,
552           read another block from the stream.
553        """
554        c = len(self.buf)
555        t = [self.buf]
556        while c < size:
557            buf = self.fileobj.read(self.bufsize)
558            if not buf:
559                break
560            t.append(buf)
561            c += len(buf)
562        t = b"".join(t)
563        self.buf = t[size:]
564        return t[:size]
565# class _Stream
566
567class _StreamProxy(object):
568    """Small proxy class that enables transparent compression
569       detection for the Stream interface (mode 'r|*').
570    """
571
572    def __init__(self, fileobj):
573        self.fileobj = fileobj
574        self.buf = self.fileobj.read(BLOCKSIZE)
575
576    def read(self, size):
577        self.read = self.fileobj.read
578        return self.buf
579
580    def getcomptype(self):
581        if self.buf.startswith(b"\x1f\x8b\x08"):
582            return "gz"
583        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
584            return "bz2"
585        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
586            return "xz"
587        else:
588            return "tar"
589
590    def close(self):
591        self.fileobj.close()
592# class StreamProxy
593
594#------------------------
595# Extraction file object
596#------------------------
597class _FileInFile(object):
598    """A thin wrapper around an existing file object that
599       provides a part of its data as an individual file
600       object.
601    """
602
603    def __init__(self, fileobj, offset, size, blockinfo=None):
604        self.fileobj = fileobj
605        self.offset = offset
606        self.size = size
607        self.position = 0
608        self.name = getattr(fileobj, "name", None)
609        self.closed = False
610
611        if blockinfo is None:
612            blockinfo = [(0, size)]
613
614        # Construct a map with data and zero blocks.
615        self.map_index = 0
616        self.map = []
617        lastpos = 0
618        realpos = self.offset
619        for offset, size in blockinfo:
620            if offset > lastpos:
621                self.map.append((False, lastpos, offset, None))
622            self.map.append((True, offset, offset + size, realpos))
623            realpos += size
624            lastpos = offset + size
625        if lastpos < self.size:
626            self.map.append((False, lastpos, self.size, None))
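        # For example, blockinfo [(0, 10), (20, 10)] with size 35 yields
        #     [(True, 0, 10, offset), (False, 10, 20, None),
        #      (True, 20, 30, offset + 10), (False, 30, 35, None)]
        # i.e. alternating data runs and zero-filled holes covering the file.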
627
628    def flush(self):
629        pass
630
631    def readable(self):
632        return True
633
634    def writable(self):
635        return False
636
637    def seekable(self):
638        return self.fileobj.seekable()
639
640    def tell(self):
641        """Return the current file position.
642        """
643        return self.position
644
645    def seek(self, position, whence=io.SEEK_SET):
646        """Seek to a position in the file.
647        """
648        if whence == io.SEEK_SET:
649            self.position = min(max(position, 0), self.size)
650        elif whence == io.SEEK_CUR:
651            if position < 0:
652                self.position = max(self.position + position, 0)
653            else:
654                self.position = min(self.position + position, self.size)
655        elif whence == io.SEEK_END:
656            self.position = max(min(self.size + position, self.size), 0)
657        else:
658            raise ValueError("Invalid argument")
659        return self.position
660
661    def read(self, size=None):
662        """Read data from the file.
663        """
664        if size is None:
665            size = self.size - self.position
666        else:
667            size = min(size, self.size - self.position)
668
669        buf = b""
670        while size > 0:
671            while True:
672                data, start, stop, offset = self.map[self.map_index]
673                if start <= self.position < stop:
674                    break
675                else:
676                    self.map_index += 1
677                    if self.map_index == len(self.map):
678                        self.map_index = 0
679            length = min(size, stop - self.position)
680            if data:
681                self.fileobj.seek(offset + (self.position - start))
682                b = self.fileobj.read(length)
683                if len(b) != length:
684                    raise ReadError("unexpected end of data")
685                buf += b
686            else:
687                buf += NUL * length
688            size -= length
689            self.position += length
690        return buf
691
692    def readinto(self, b):
693        buf = self.read(len(b))
694        b[:len(buf)] = buf
695        return len(buf)
696
697    def close(self):
698        self.closed = True
699#class _FileInFile
700
701class ExFileObject(io.BufferedReader):
702
703    def __init__(self, tarfile, tarinfo):
704        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
705                tarinfo.size, tarinfo.sparse)
706        super().__init__(fileobj)
707#class ExFileObject
708
709#------------------
710# Exported Classes
711#------------------
712class TarInfo(object):
713    """Informational class which holds the details about an
714       archive member given by a tar header block.
715       TarInfo objects are returned by TarFile.getmember(),
716       TarFile.getmembers() and TarFile.gettarinfo() and are
717       usually created internally.
718    """
719
720    __slots__ = dict(
721        name = 'Name of the archive member.',
722        mode = 'Permission bits.',
723        uid = 'User ID of the user who originally stored this member.',
724        gid = 'Group ID of the user who originally stored this member.',
725        size = 'Size in bytes.',
726        mtime = 'Time of last modification.',
727        chksum = 'Header checksum.',
728        type = ('File type. type is usually one of these constants: '
729                'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, '
730                'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'),
731        linkname = ('Name of the target file name, which is only present '
732                    'in TarInfo objects of type LNKTYPE and SYMTYPE.'),
733        uname = 'User name.',
734        gname = 'Group name.',
735        devmajor = 'Device major number.',
736        devminor = 'Device minor number.',
737        offset = 'The tar header starts here.',
738        offset_data = "The file's data starts here.",
739        pax_headers = ('A dictionary containing key-value pairs of an '
740                       'associated pax extended header.'),
741        sparse = 'Sparse member information.',
742        tarfile = None,
743        _sparse_structs = None,
744        _link_target = None,
745        )
746
747    def __init__(self, name=""):
748        """Construct a TarInfo object. name is the optional name
749           of the member.
750        """
751        self.name = name        # member name
752        self.mode = 0o644       # file permissions
753        self.uid = 0            # user id
754        self.gid = 0            # group id
755        self.size = 0           # file size
756        self.mtime = 0          # modification time
757        self.chksum = 0         # header checksum
758        self.type = REGTYPE     # member type
759        self.linkname = ""      # link name
760        self.uname = ""         # user name
761        self.gname = ""         # group name
762        self.devmajor = 0       # device major number
763        self.devminor = 0       # device minor number
764
765        self.offset = 0         # the tar header starts here
766        self.offset_data = 0    # the file's data starts here
767
768        self.sparse = None      # sparse member information
769        self.pax_headers = {}   # pax header information
770
771    @property
772    def path(self):
773        'In pax headers, "name" is called "path".'
774        return self.name
775
776    @path.setter
777    def path(self, name):
778        self.name = name
779
780    @property
781    def linkpath(self):
782        'In pax headers, "linkname" is called "linkpath".'
783        return self.linkname
784
785    @linkpath.setter
786    def linkpath(self, linkname):
787        self.linkname = linkname
788
789    def __repr__(self):
790        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
791
792    def get_info(self):
793        """Return the TarInfo's attributes as a dictionary.
794        """
795        info = {
796            "name":     self.name,
797            "mode":     self.mode & 0o7777,
798            "uid":      self.uid,
799            "gid":      self.gid,
800            "size":     self.size,
801            "mtime":    self.mtime,
802            "chksum":   self.chksum,
803            "type":     self.type,
804            "linkname": self.linkname,
805            "uname":    self.uname,
806            "gname":    self.gname,
807            "devmajor": self.devmajor,
808            "devminor": self.devminor
809        }
810
811        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
812            info["name"] += "/"
813
814        return info
815
816    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
817        """Return a tar header as a string of 512 byte blocks.
818        """
819        info = self.get_info()
820
821        if format == USTAR_FORMAT:
822            return self.create_ustar_header(info, encoding, errors)
823        elif format == GNU_FORMAT:
824            return self.create_gnu_header(info, encoding, errors)
825        elif format == PAX_FORMAT:
826            return self.create_pax_header(info, encoding)
827        else:
828            raise ValueError("invalid format")
829
830    def create_ustar_header(self, info, encoding, errors):
831        """Return the object as a ustar header block.
832        """
833        info["magic"] = POSIX_MAGIC
834
835        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
836            raise ValueError("linkname is too long")
837
838        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
839            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
840
841        return self._create_header(info, USTAR_FORMAT, encoding, errors)
842
843    def create_gnu_header(self, info, encoding, errors):
844        """Return the object as a GNU header block sequence.
845        """
846        info["magic"] = GNU_MAGIC
847
848        buf = b""
849        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
850            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
851
852        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
853            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
854
855        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
856
857    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplemental information.
        """
862        info["magic"] = POSIX_MAGIC
863        pax_headers = self.pax_headers.copy()
864
865        # Test string fields for values that exceed the field length or cannot
866        # be represented in ASCII encoding.
867        for name, hname, length in (
868                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
869                ("uname", "uname", 32), ("gname", "gname", 32)):
870
871            if hname in pax_headers:
872                # The pax header has priority.
873                continue
874
875            # Try to encode the string as ASCII.
876            try:
877                info[name].encode("ascii", "strict")
878            except UnicodeEncodeError:
879                pax_headers[hname] = info[name]
880                continue
881
882            if len(info[name]) > length:
883                pax_headers[hname] = info[name]
884
        # Test number fields for values that exceed the field limit or that
        # are stored as floats.
887        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
888            if name in pax_headers:
889                # The pax header has priority. Avoid overflow.
890                info[name] = 0
891                continue
892
893            val = info[name]
894            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
895                pax_headers[name] = str(val)
896                info[name] = 0
897
898        # Create a pax extended header if necessary.
899        if pax_headers:
900            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
901        else:
902            buf = b""
903
904        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
905
906    @classmethod
907    def create_pax_global_header(cls, pax_headers):
908        """Return the object as a pax global header block sequence.
909        """
910        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
911
912    def _posix_split_name(self, name, encoding, errors):
913        """Split a name longer than 100 chars into a prefix
914           and a name part.
915        """
916        components = name.split("/")
917        for i in range(1, len(components)):
918            prefix = "/".join(components[:i])
919            name = "/".join(components[i:])
920            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
921                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
922                break
923        else:
924            raise ValueError("name is too long")
925
926        return prefix, name
927
928    @staticmethod
929    def _create_header(info, format, encoding, errors):
930        """Return a header block. info is a dictionary with file
931           information, format must be one of the *_FORMAT constants.
932        """
933        parts = [
934            stn(info.get("name", ""), 100, encoding, errors),
935            itn(info.get("mode", 0) & 0o7777, 8, format),
936            itn(info.get("uid", 0), 8, format),
937            itn(info.get("gid", 0), 8, format),
938            itn(info.get("size", 0), 12, format),
939            itn(info.get("mtime", 0), 12, format),
940            b"        ", # checksum field
941            info.get("type", REGTYPE),
942            stn(info.get("linkname", ""), 100, encoding, errors),
943            info.get("magic", POSIX_MAGIC),
944            stn(info.get("uname", ""), 32, encoding, errors),
945            stn(info.get("gname", ""), 32, encoding, errors),
946            itn(info.get("devmajor", 0), 8, format),
947            itn(info.get("devminor", 0), 8, format),
948            stn(info.get("prefix", ""), 155, encoding, errors)
949        ]
950
951        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
952        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
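        # Patch the checksum into the 8-byte chksum field at offset 148
        # (512 - 364 == 148, 512 - 357 == 155): six octal digits plus a NUL,
        # keeping one of the placeholder spaces, the layout GNU tar also uses.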
953        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
954        return buf
955
956    @staticmethod
957    def _create_payload(payload):
958        """Return the string payload filled with zero bytes
959           up to the next 512 byte border.
960        """
961        blocks, remainder = divmod(len(payload), BLOCKSIZE)
962        if remainder > 0:
963            payload += (BLOCKSIZE - remainder) * NUL
964        return payload
965
966    @classmethod
967    def _create_gnu_long_header(cls, name, type, encoding, errors):
968        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
969           for name.
970        """
971        name = name.encode(encoding, errors) + NUL
972
973        info = {}
974        info["name"] = "././@LongLink"
975        info["type"] = type
976        info["size"] = len(name)
977        info["magic"] = GNU_MAGIC
978
979        # create extended header + name blocks.
980        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
981                cls._create_payload(name)
982
983    @classmethod
984    def _create_pax_generic_header(cls, pax_headers, type, encoding):
985        """Return a POSIX.1-2008 extended or global header sequence
986           that contains a list of keyword, value pairs. The values
987           must be strings.
988        """
989        # Check if one of the fields contains surrogate characters and thereby
990        # forces hdrcharset=BINARY, see _proc_pax() for more information.
991        binary = False
992        for keyword, value in pax_headers.items():
993            try:
994                value.encode("utf-8", "strict")
995            except UnicodeEncodeError:
996                binary = True
997                break
998
999        records = b""
1000        if binary:
1001            # Put the hdrcharset field at the beginning of the header.
1002            records += b"21 hdrcharset=BINARY\n"
1003
1004        for keyword, value in pax_headers.items():
1005            keyword = keyword.encode("utf-8")
1006            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, the encoding must match the string.
1009                value = value.encode(encoding, "surrogateescape")
1010            else:
1011                value = value.encode("utf-8")
1012
1013            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
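            # The record length includes its own decimal representation, so
            # iterate until the value is self-consistent. E.g. for
            # keyword=b"path", value=b"foo": l = 10 and the loop settles on
            # p = 12, giving the record b"12 path=foo\n" (12 bytes long).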
1014            n = p = 0
1015            while True:
1016                n = l + len(str(p))
1017                if n == p:
1018                    break
1019                p = n
1020            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"
1021
1022        # We use a hardcoded "././@PaxHeader" name like star does
1023        # instead of the one that POSIX recommends.
1024        info = {}
1025        info["name"] = "././@PaxHeader"
1026        info["type"] = type
1027        info["size"] = len(records)
1028        info["magic"] = POSIX_MAGIC
1029
1030        # Create pax header + record blocks.
1031        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
1032                cls._create_payload(records)
1033
1034    @classmethod
1035    def frombuf(cls, buf, encoding, errors):
1036        """Construct a TarInfo object from a 512 byte bytes object.
1037        """
1038        if len(buf) == 0:
1039            raise EmptyHeaderError("empty header")
1040        if len(buf) != BLOCKSIZE:
1041            raise TruncatedHeaderError("truncated header")
1042        if buf.count(NUL) == BLOCKSIZE:
1043            raise EOFHeaderError("end of file header")
1044
1045        chksum = nti(buf[148:156])
1046        if chksum not in calc_chksums(buf):
1047            raise InvalidHeaderError("bad checksum")
1048
1049        obj = cls()
1050        obj.name = nts(buf[0:100], encoding, errors)
1051        obj.mode = nti(buf[100:108])
1052        obj.uid = nti(buf[108:116])
1053        obj.gid = nti(buf[116:124])
1054        obj.size = nti(buf[124:136])
1055        obj.mtime = nti(buf[136:148])
1056        obj.chksum = chksum
1057        obj.type = buf[156:157]
1058        obj.linkname = nts(buf[157:257], encoding, errors)
1059        obj.uname = nts(buf[265:297], encoding, errors)
1060        obj.gname = nts(buf[297:329], encoding, errors)
1061        obj.devmajor = nti(buf[329:337])
1062        obj.devminor = nti(buf[337:345])
1063        prefix = nts(buf[345:500], encoding, errors)
1064
1065        # Old V7 tar format represents a directory as a regular
1066        # file with a trailing slash.
1067        if obj.type == AREGTYPE and obj.name.endswith("/"):
1068            obj.type = DIRTYPE
1069
1070        # The old GNU sparse format occupies some of the unused
1071        # space in the buffer for up to 4 sparse structures.
1072        # Save them for later processing in _proc_sparse().
1073        if obj.type == GNUTYPE_SPARSE:
1074            pos = 386
1075            structs = []
1076            for i in range(4):
1077                try:
1078                    offset = nti(buf[pos:pos + 12])
1079                    numbytes = nti(buf[pos + 12:pos + 24])
1080                except ValueError:
1081                    break
1082                structs.append((offset, numbytes))
1083                pos += 24
1084            isextended = bool(buf[482])
1085            origsize = nti(buf[483:495])
1086            obj._sparse_structs = (structs, isextended, origsize)
1087
1088        # Remove redundant slashes from directories.
1089        if obj.isdir():
1090            obj.name = obj.name.rstrip("/")
1091
1092        # Reconstruct a ustar longname.
1093        if prefix and obj.type not in GNU_TYPES:
1094            obj.name = prefix + "/" + obj.name
1095        return obj
1096
1097    @classmethod
1098    def fromtarfile(cls, tarfile):
1099        """Return the next TarInfo object from TarFile object
1100           tarfile.
1101        """
1102        buf = tarfile.fileobj.read(BLOCKSIZE)
1103        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1104        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1105        return obj._proc_member(tarfile)
1106
    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
1118    def _proc_member(self, tarfile):
1119        """Choose the right processing method depending on
1120           the type and call it.
1121        """
1122        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1123            return self._proc_gnulong(tarfile)
1124        elif self.type == GNUTYPE_SPARSE:
1125            return self._proc_sparse(tarfile)
1126        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1127            return self._proc_pax(tarfile)
1128        else:
1129            return self._proc_builtin(tarfile)
1130
1131    def _proc_builtin(self, tarfile):
1132        """Process a builtin type or an unknown type which
1133           will be treated as a regular file.
1134        """
1135        self.offset_data = tarfile.fileobj.tell()
1136        offset = self.offset_data
1137        if self.isreg() or self.type not in SUPPORTED_TYPES:
1138            # Skip the following data blocks.
1139            offset += self._block(self.size)
1140        tarfile.offset = offset
1141
1142        # Patch the TarInfo object with saved global
1143        # header information.
1144        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1145
1146        return self
1147
1148    def _proc_gnulong(self, tarfile):
1149        """Process the blocks that hold a GNU longname
1150           or longlink member.
1151        """
1152        buf = tarfile.fileobj.read(self._block(self.size))
1153
1154        # Fetch the next header and process it.
1155        try:
1156            next = self.fromtarfile(tarfile)
1157        except HeaderError:
1158            raise SubsequentHeaderError("missing or bad subsequent header")
1159
1160        # Patch the TarInfo object from the next header with
1161        # the longname information.
1162        next.offset = self.offset
1163        if self.type == GNUTYPE_LONGNAME:
1164            next.name = nts(buf, tarfile.encoding, tarfile.errors)
1165        elif self.type == GNUTYPE_LONGLINK:
1166            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1167
1168        return next
1169
1170    def _proc_sparse(self, tarfile):
1171        """Process a GNU sparse header plus extra headers.
1172        """
1173        # We already collected some sparse structures in frombuf().
1174        structs, isextended, origsize = self._sparse_structs
1175        del self._sparse_structs
1176
1177        # Collect sparse structures from extended header blocks.
1178        while isextended:
1179            buf = tarfile.fileobj.read(BLOCKSIZE)
1180            pos = 0
1181            for i in range(21):
1182                try:
1183                    offset = nti(buf[pos:pos + 12])
1184                    numbytes = nti(buf[pos + 12:pos + 24])
1185                except ValueError:
1186                    break
1187                if offset and numbytes:
1188                    structs.append((offset, numbytes))
1189                pos += 24
1190            isextended = bool(buf[504])
1191        self.sparse = structs
1192
1193        self.offset_data = tarfile.fileobj.tell()
1194        tarfile.offset = self.offset_data + self._block(self.size)
1195        self.size = origsize
1196        return self
1197
1198    def _proc_pax(self, tarfile):
1199        """Process an extended or global header as described in
1200           POSIX.1-2008.
1201        """
1202        # Read the header information.
1203        buf = tarfile.fileobj.read(self._block(self.size))
1204
1205        # A pax header stores supplemental information for either
1206        # the following file (extended) or all following files
1207        # (global).
1208        if self.type == XGLTYPE:
1209            pax_headers = tarfile.pax_headers
1210        else:
1211            pax_headers = tarfile.pax_headers.copy()
1212
        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded, but POSIX.1-2008 allows tar
        # implementations to store them as raw binary strings if the
        # translation to UTF-8 fails.
1218        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
1219        if match is not None:
1220            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")
1221
1222        # For the time being, we don't care about anything other than "BINARY".
1223        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8", in other words UTF-8.
1225        hdrcharset = pax_headers.get("hdrcharset")
1226        if hdrcharset == "BINARY":
1227            encoding = tarfile.encoding
1228        else:
1229            encoding = "utf-8"
1230
        # Parse pax header information. A record looks like this:
1232        # "%d %s=%s\n" % (length, keyword, value). length is the size
1233        # of the complete record including the length field itself and
1234        # the newline. keyword and value are both UTF-8 encoded strings.
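        # A typical record therefore looks like
        # b"30 mtime=1350244989.492960618\n" (30 bytes including the length
        # field and the trailing newline).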
1235        regex = re.compile(br"(\d+) ([^=]+)=")
1236        pos = 0
1237        while True:
1238            match = regex.match(buf, pos)
1239            if not match:
1240                break
1241
1242            length, keyword = match.groups()
1243            length = int(length)
1244            if length == 0:
1245                raise InvalidHeaderError("invalid header")
1246            value = buf[match.end(2) + 1:match.start(1) + length - 1]
1247
1248            # Normally, we could just use "utf-8" as the encoding and "strict"
1249            # as the error handler, but we better not take the risk. For
1250            # example, GNU tar <= 1.23 is known to store filenames it cannot
1251            # translate to UTF-8 as raw strings (unfortunately without a
1252            # hdrcharset=BINARY header).
1253            # We first try the strict standard encoding, and if that fails we
1254            # fall back on the user's encoding and error handler.
1255            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
1256                    tarfile.errors)
1257            if keyword in PAX_NAME_FIELDS:
1258                value = self._decode_pax_field(value, encoding, tarfile.encoding,
1259                        tarfile.errors)
1260            else:
1261                value = self._decode_pax_field(value, "utf-8", "utf-8",
1262                        tarfile.errors)
1263
1264            pax_headers[keyword] = value
1265            pos += length
1266
1267        # Fetch the next header.
1268        try:
1269            next = self.fromtarfile(tarfile)
1270        except HeaderError:
1271            raise SubsequentHeaderError("missing or bad subsequent header")
1272
1273        # Process GNU sparse information.
1274        if "GNU.sparse.map" in pax_headers:
1275            # GNU extended sparse format version 0.1.
1276            self._proc_gnusparse_01(next, pax_headers)
1277
1278        elif "GNU.sparse.size" in pax_headers:
1279            # GNU extended sparse format version 0.0.
1280            self._proc_gnusparse_00(next, pax_headers, buf)
1281
1282        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
1283            # GNU extended sparse format version 1.0.
1284            self._proc_gnusparse_10(next, pax_headers, tarfile)
1285
1286        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
1287            # Patch the TarInfo object with the extended header info.
1288            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
1289            next.offset = self.offset
1290
1291            if "size" in pax_headers:
1292                # If the extended header replaces the size field,
1293                # we need to recalculate the offset where the next
1294                # header starts.
1295                offset = next.offset_data
1296                if next.isreg() or next.type not in SUPPORTED_TYPES:
1297                    offset += next._block(next.size)
1298                tarfile.offset = offset
1299
1300        return next
1301
1302    def _proc_gnusparse_00(self, next, pax_headers, buf):
1303        """Process a GNU tar extended sparse header, version 0.0.
1304        """
1305        offsets = []
1306        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1307            offsets.append(int(match.group(1)))
1308        numbytes = []
1309        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1310            numbytes.append(int(match.group(1)))
1311        next.sparse = list(zip(offsets, numbytes))
1312
1313    def _proc_gnusparse_01(self, next, pax_headers):
1314        """Process a GNU tar extended sparse header, version 0.1.
1315        """
1316        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1317        next.sparse = list(zip(sparse[::2], sparse[1::2]))
1318
1319    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
1320        """Process a GNU tar extended sparse header, version 1.0.
1321        """
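        # In this format the sparse map is stored at the start of the member's
        # data area as newline-terminated decimal numbers: first the number of
        # map entries, then alternating offset/size pairs, padded to a full
        # 512-byte block (hence the reading in BLOCKSIZE chunks below).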
1322        fields = None
1323        sparse = []
1324        buf = tarfile.fileobj.read(BLOCKSIZE)
1325        fields, buf = buf.split(b"\n", 1)
1326        fields = int(fields)
1327        while len(sparse) < fields * 2:
1328            if b"\n" not in buf:
1329                buf += tarfile.fileobj.read(BLOCKSIZE)
1330            number, buf = buf.split(b"\n", 1)
1331            sparse.append(int(number))
1332        next.offset_data = tarfile.fileobj.tell()
1333        next.sparse = list(zip(sparse[::2], sparse[1::2]))
1334
1335    def _apply_pax_info(self, pax_headers, encoding, errors):
1336        """Replace fields with supplemental information from a previous
1337           pax extended or global header.
1338        """
1339        for keyword, value in pax_headers.items():
1340            if keyword == "GNU.sparse.name":
1341                setattr(self, "path", value)
1342            elif keyword == "GNU.sparse.size":
1343                setattr(self, "size", int(value))
1344            elif keyword == "GNU.sparse.realsize":
1345                setattr(self, "size", int(value))
1346            elif keyword in PAX_FIELDS:
1347                if keyword in PAX_NUMBER_FIELDS:
1348                    try:
1349                        value = PAX_NUMBER_FIELDS[keyword](value)
1350                    except ValueError:
1351                        value = 0
1352                if keyword == "path":
1353                    value = value.rstrip("/")
1354                setattr(self, keyword, value)
1355
1356        self.pax_headers = pax_headers.copy()
1357
1358    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1359        """Decode a single field from a pax record.
1360        """
1361        try:
1362            return value.decode(encoding, "strict")
1363        except UnicodeDecodeError:
1364            return value.decode(fallback_encoding, fallback_errors)
1365
1366    def _block(self, count):
1367        """Round up a byte count by BLOCKSIZE and return it,
1368           e.g. _block(834) => 1024.
1369        """
1370        blocks, remainder = divmod(count, BLOCKSIZE)
1371        if remainder:
1372            blocks += 1
1373        return blocks * BLOCKSIZE
1374
1375    def isreg(self):
1376        'Return True if the Tarinfo object is a regular file.'
1377        return self.type in REGULAR_TYPES
1378
1379    def isfile(self):
1380        'Return True if the Tarinfo object is a regular file.'
1381        return self.isreg()
1382
1383    def isdir(self):
1384        'Return True if it is a directory.'
1385        return self.type == DIRTYPE
1386
1387    def issym(self):
1388        'Return True if it is a symbolic link.'
1389        return self.type == SYMTYPE
1390
1391    def islnk(self):
1392        'Return True if it is a hard link.'
1393        return self.type == LNKTYPE
1394
1395    def ischr(self):
1396        'Return True if it is a character device.'
1397        return self.type == CHRTYPE
1398
1399    def isblk(self):
1400        'Return True if it is a block device.'
1401        return self.type == BLKTYPE
1402
1403    def isfifo(self):
1404        'Return True if it is a FIFO.'
1405        return self.type == FIFOTYPE
1406
1407    def issparse(self):
1408        return self.sparse is not None
1409
1410    def isdev(self):
1411        'Return True if it is one of character device, block device or FIFO.'
1412        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1413# class TarInfo
1414
1415class TarFile(object):
1416    """The TarFile Class provides an interface to tar archives.
1417    """
1418
1419    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)
1420
1421    dereference = False         # If true, add content of linked file to the
1422                                # tar file, else the link.
1423
1424    ignore_zeros = False        # If true, skips empty or invalid blocks and
1425                                # continues processing.
1426
1427    errorlevel = 1              # If 0, fatal errors only appear in debug
1428                                # messages (if debug >= 0). If > 0, errors
1429                                # are passed to the caller as exceptions.
1430
1431    format = DEFAULT_FORMAT     # The format to use when creating an archive.
1432
1433    encoding = ENCODING         # Encoding for 8-bit character strings.
1434
1435    errors = None               # Error handler for unicode conversion.
1436
1437    tarinfo = TarInfo           # The default TarInfo class to use.
1438
1439    fileobject = ExFileObject   # The file-object for extractfile().
1440
1441    def __init__(self, name=None, mode="r", fileobj=None, format=None,
1442            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
1443            errors="surrogateescape", pax_headers=None, debug=None,
1444            errorlevel=None, copybufsize=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file, 'w' to create a new file overwriting an existing one, or 'x'
           to create a new file only if it does not already exist. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed when TarFile is closed.
        """
1453        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
1454        if mode not in modes:
1455            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1456        self.mode = mode
1457        self._mode = modes[mode]
1458
1459        if not fileobj:
1460            if self.mode == "a" and not os.path.exists(name):
1461                # Create nonexistent files in append mode.
1462                self.mode = "w"
1463                self._mode = "wb"
1464            fileobj = bltn_open(name, self._mode)
1465            self._extfileobj = False
1466        else:
1467            if (name is None and hasattr(fileobj, "name") and
1468                isinstance(fileobj.name, (str, bytes))):
1469                name = fileobj.name
1470            if hasattr(fileobj, "mode"):
1471                self._mode = fileobj.mode
1472            self._extfileobj = True
1473        self.name = os.path.abspath(name) if name else None
1474        self.fileobj = fileobj
1475
1476        # Init attributes.
1477        if format is not None:
1478            self.format = format
1479        if tarinfo is not None:
1480            self.tarinfo = tarinfo
1481        if dereference is not None:
1482            self.dereference = dereference
1483        if ignore_zeros is not None:
1484            self.ignore_zeros = ignore_zeros
1485        if encoding is not None:
1486            self.encoding = encoding
1487        self.errors = errors
1488
1489        if pax_headers is not None and self.format == PAX_FORMAT:
1490            self.pax_headers = pax_headers
1491        else:
1492            self.pax_headers = {}
1493
1494        if debug is not None:
1495            self.debug = debug
1496        if errorlevel is not None:
1497            self.errorlevel = errorlevel
1498
1499        # Init datastructures.
1500        self.copybufsize = copybufsize
1501        self.closed = False
1502        self.members = []       # list of members as TarInfo objects
1503        self._loaded = False    # flag if all members have been read
1504        self.offset = self.fileobj.tell()
1505                                # current position in the archive file
1506        self.inodes = {}        # dictionary caching the inodes of
1507                                # archive members already added
1508
1509        try:
1510            if self.mode == "r":
1511                self.firstmember = None
1512                self.firstmember = self.next()
1513
1514            if self.mode == "a":
1515                # Move to the end of the archive,
1516                # before the first empty block.
1517                while True:
1518                    self.fileobj.seek(self.offset)
1519                    try:
1520                        tarinfo = self.tarinfo.fromtarfile(self)
1521                        self.members.append(tarinfo)
1522                    except EOFHeaderError:
1523                        self.fileobj.seek(self.offset)
1524                        break
1525                    except HeaderError as e:
1526                        raise ReadError(str(e))
1527
1528            if self.mode in ("a", "w", "x"):
1529                self._loaded = True
1530
1531                if self.pax_headers:
1532                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1533                    self.fileobj.write(buf)
1534                    self.offset += len(buf)
1535        except:
1536            if not self._extfileobj:
1537                self.fileobj.close()
1538            self.closed = True
1539            raise
1540
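    # A minimal usage sketch for the constructor above (names are placeholders;
    # the open() classmethod below is usually preferred because it also handles
    # compression and the stream modes):
    #
    #   import io, tarfile
    #   buf = io.BytesIO()                           # any seekable binary file object
    #   tf = tarfile.TarFile(fileobj=buf, mode="w")
    #   # ... add members here ...
    #   tf.close()                                   # buf stays open (_extfileobj is True)
    #   data = buf.getvalue()                        # the finished archive as bytes
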
1541    #--------------------------------------------------------------------------
1542    # Below are the classmethods which act as alternate constructors to the
1543    # TarFile class. The open() method is the only one that is needed for
1544    # public use; it is the "super"-constructor and is able to select an
1545    # adequate "sub"-constructor for a particular compression using the mapping
1546    # from OPEN_METH.
1547    #
1548    # This concept allows one to subclass TarFile without losing the comfort of
1549    # the super-constructor. A sub-constructor is registered and made available
1550    # by adding it to the mapping in OPEN_METH.
1551
1552    @classmethod
1553    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
1554        """Open a tar archive for reading, writing or appending. Return
1555           an appropriate TarFile object.
1556
1557           mode:
1558           'r' or 'r:*' open for reading with transparent compression
1559           'r:'         open for reading exclusively uncompressed
1560           'r:gz'       open for reading with gzip compression
1561           'r:bz2'      open for reading with bzip2 compression
1562           'r:xz'       open for reading with lzma compression
1563           'a' or 'a:'  open for appending, creating the file if necessary
1564           'w' or 'w:'  open for writing without compression
1565           'w:gz'       open for writing with gzip compression
1566           'w:bz2'      open for writing with bzip2 compression
1567           'w:xz'       open for writing with lzma compression
1568
1569           'x' or 'x:'  create a tarfile exclusively without compression, raise
1570                        an exception if the file already exists
1571           'x:gz'       create a gzip compressed tarfile, raise an exception
1572                        if the file already exists
1573           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
1574                        if the file already exists
1575           'x:xz'       create an lzma compressed tarfile, raise an exception
1576                        if the file already exists
1577
1578           'r|*'        open a stream of tar blocks with transparent compression
1579           'r|'         open an uncompressed stream of tar blocks for reading
1580           'r|gz'       open a gzip compressed stream of tar blocks
1581           'r|bz2'      open a bzip2 compressed stream of tar blocks
1582           'r|xz'       open an lzma compressed stream of tar blocks
1583           'w|'         open an uncompressed stream for writing
1584           'w|gz'       open a gzip compressed stream for writing
1585           'w|bz2'      open a bzip2 compressed stream for writing
1586           'w|xz'       open an lzma compressed stream for writing
1587        """
1588
1589        if not name and not fileobj:
1590            raise ValueError("nothing to open")
1591
1592        if mode in ("r", "r:*"):
1593            # Find out which *open() is appropriate for opening the file.
1594            def not_compressed(comptype):
1595                return cls.OPEN_METH[comptype] == 'taropen'
1596            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
1597                func = getattr(cls, cls.OPEN_METH[comptype])
1598                if fileobj is not None:
1599                    saved_pos = fileobj.tell()
1600                try:
1601                    return func(name, "r", fileobj, **kwargs)
1602                except (ReadError, CompressionError):
1603                    if fileobj is not None:
1604                        fileobj.seek(saved_pos)
1605                    continue
1606            raise ReadError("file could not be opened successfully")
1607
1608        elif ":" in mode:
1609            filemode, comptype = mode.split(":", 1)
1610            filemode = filemode or "r"
1611            comptype = comptype or "tar"
1612
1613            # Select the *open() function according to
1614            # given compression.
1615            if comptype in cls.OPEN_METH:
1616                func = getattr(cls, cls.OPEN_METH[comptype])
1617            else:
1618                raise CompressionError("unknown compression type %r" % comptype)
1619            return func(name, filemode, fileobj, **kwargs)
1620
1621        elif "|" in mode:
1622            filemode, comptype = mode.split("|", 1)
1623            filemode = filemode or "r"
1624            comptype = comptype or "tar"
1625
1626            if filemode not in ("r", "w"):
1627                raise ValueError("mode must be 'r' or 'w'")
1628
1629            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
1630            try:
1631                t = cls(name, filemode, stream, **kwargs)
1632            except:
1633                stream.close()
1634                raise
1635            t._extfileobj = False
1636            return t
1637
1638        elif mode in ("a", "w", "x"):
1639            return cls.taropen(name, mode, fileobj, **kwargs)
1640
1641        raise ValueError("undiscernible mode")
1642
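    # A short sketch of the mode strings documented above ("example.tar.gz" and
    # "backup.tar" are placeholder names):
    #
    #   import tarfile
    #   with tarfile.open("example.tar.gz", "r:*") as tf:   # transparent decompression
    #       names = tf.getnames()
    #   with tarfile.open("backup.tar", "w") as tf:         # plain, uncompressed archive
    #       tf.add("notes.txt")
    #   # The "|" stream modes (e.g. "r|gz", "w|xz") read or write the archive
    #   # strictly sequentially and therefore also work on non-seekable file
    #   # objects such as pipes and sockets.
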
1643    @classmethod
1644    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1645        """Open uncompressed tar archive name for reading or writing.
1646        """
1647        if mode not in ("r", "a", "w", "x"):
1648            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1649        return cls(name, mode, fileobj, **kwargs)
1650
1651    @classmethod
1652    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1653        """Open gzip compressed tar archive name for reading or writing.
1654           Appending is not allowed.
1655        """
1656        if mode not in ("r", "w", "x"):
1657            raise ValueError("mode must be 'r', 'w' or 'x'")
1658
1659        try:
1660            from gzip import GzipFile
1661        except ImportError:
1662            raise CompressionError("gzip module is not available")
1663
1664        try:
1665            fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
1666        except OSError:
1667            if fileobj is not None and mode == 'r':
1668                raise ReadError("not a gzip file")
1669            raise
1670
1671        try:
1672            t = cls.taropen(name, mode, fileobj, **kwargs)
1673        except OSError:
1674            fileobj.close()
1675            if mode == 'r':
1676                raise ReadError("not a gzip file")
1677            raise
1678        except:
1679            fileobj.close()
1680            raise
1681        t._extfileobj = False
1682        return t
1683
1684    @classmethod
1685    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1686        """Open bzip2 compressed tar archive name for reading or writing.
1687           Appending is not allowed.
1688        """
1689        if mode not in ("r", "w", "x"):
1690            raise ValueError("mode must be 'r', 'w' or 'x'")
1691
1692        try:
1693            from bz2 import BZ2File
1694        except ImportError:
1695            raise CompressionError("bz2 module is not available")
1696
1697        fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)
1698
1699        try:
1700            t = cls.taropen(name, mode, fileobj, **kwargs)
1701        except (OSError, EOFError):
1702            fileobj.close()
1703            if mode == 'r':
1704                raise ReadError("not a bzip2 file")
1705            raise
1706        except:
1707            fileobj.close()
1708            raise
1709        t._extfileobj = False
1710        return t
1711
1712    @classmethod
1713    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
1714        """Open lzma compressed tar archive name for reading or writing.
1715           Appending is not allowed.
1716        """
1717        if mode not in ("r", "w", "x"):
1718            raise ValueError("mode must be 'r', 'w' or 'x'")
1719
1720        try:
1721            from lzma import LZMAFile, LZMAError
1722        except ImportError:
1723            raise CompressionError("lzma module is not available")
1724
1725        fileobj = LZMAFile(fileobj or name, mode, preset=preset)
1726
1727        try:
1728            t = cls.taropen(name, mode, fileobj, **kwargs)
1729        except (LZMAError, EOFError):
1730            fileobj.close()
1731            if mode == 'r':
1732                raise ReadError("not an lzma file")
1733            raise
1734        except:
1735            fileobj.close()
1736            raise
1737        t._extfileobj = False
1738        return t
1739
1740    # All *open() methods are registered here.
1741    OPEN_METH = {
1742        "tar": "taropen",   # uncompressed tar
1743        "gz":  "gzopen",    # gzip compressed tar
1744        "bz2": "bz2open",   # bzip2 compressed tar
1745        "xz":  "xzopen"     # lzma compressed tar
1746    }
1747
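    # Because open() looks OPEN_METH up on the class, a subclass can register an
    # extra compression scheme. A rough sketch only, assuming a hypothetical
    # SomeZstdFile file wrapper that is not part of this module:
    #
    #   class ZstdTarFile(TarFile):
    #       OPEN_METH = {**TarFile.OPEN_METH, "zst": "zstopen"}
    #
    #       @classmethod
    #       def zstopen(cls, name, mode="r", fileobj=None, **kwargs):
    #           fileobj = SomeZstdFile(fileobj or name, mode)   # hypothetical wrapper
    #           t = cls.taropen(name, mode, fileobj, **kwargs)
    #           t._extfileobj = False
    #           return t
    #
    #   # ZstdTarFile.open("data.tar.zst", "r:zst") would then work like the
    #   # built-in gz/bz2/xz variants.
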
1748    #--------------------------------------------------------------------------
1749    # The public methods which TarFile provides:
1750
1751    def close(self):
1752        """Close the TarFile. In write-mode, two finishing zero blocks are
1753           appended to the archive.
1754        """
1755        if self.closed:
1756            return
1757
1758        self.closed = True
1759        try:
1760            if self.mode in ("a", "w", "x"):
1761                self.fileobj.write(NUL * (BLOCKSIZE * 2))
1762                self.offset += (BLOCKSIZE * 2)
1763                # fill up the end with zero-blocks
1764                # (like option -b20 for tar does)
1765                blocks, remainder = divmod(self.offset, RECORDSIZE)
1766                if remainder > 0:
1767                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
1768        finally:
1769            if not self._extfileobj:
1770                self.fileobj.close()
1771
1772    def getmember(self, name):
1773        """Return a TarInfo object for member `name'. If `name' can not be
1774           found in the archive, KeyError is raised. If a member occurs more
1775           than once in the archive, its last occurrence is assumed to be the
1776           most up-to-date version.
1777        """
1778        tarinfo = self._getmember(name)
1779        if tarinfo is None:
1780            raise KeyError("filename %r not found" % name)
1781        return tarinfo
1782
1783    def getmembers(self):
1784        """Return the members of the archive as a list of TarInfo objects. The
1785           list has the same order as the members in the archive.
1786        """
1787        self._check()
1788        if not self._loaded:    # if we want to obtain a list of
1789            self._load()        # all members, we first have to
1790                                # scan the whole archive.
1791        return self.members
1792
1793    def getnames(self):
1794        """Return the members of the archive as a list of their names. It has
1795           the same order as the list returned by getmembers().
1796        """
1797        return [tarinfo.name for tarinfo in self.getmembers()]
1798
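    # A quick sketch of member lookup ("example.tar" and the member name are
    # placeholders):
    #
    #   with tarfile.open("example.tar") as tf:
    #       print(tf.getnames())                    # all member names, archive order
    #       info = tf.getmember("docs/readme.txt")  # raises KeyError if missing
    #       print(info.size, info.mtime, info.isdir())
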
1799    def gettarinfo(self, name=None, arcname=None, fileobj=None):
1800        """Create a TarInfo object from the result of os.stat or equivalent
1801           on an existing file. The file is either named by `name', or
1802           specified as a file object `fileobj' with a file descriptor. If
1803           given, `arcname' specifies an alternative name for the file in the
1804           archive; otherwise, the name is taken from the 'name' attribute of
1805           'fileobj', or the 'name' argument. The name should be a text
1806           string.
1807        """
1808        self._check("awx")
1809
1810        # When fileobj is given, replace name by
1811        # fileobj's real name.
1812        if fileobj is not None:
1813            name = fileobj.name
1814
1815        # Build the name of the member in the archive.
1816        # Backslashes are converted to forward slashes,
1817        # and absolute paths are turned into relative paths.
1818        if arcname is None:
1819            arcname = name
1820        drv, arcname = os.path.splitdrive(arcname)
1821        arcname = arcname.replace(os.sep, "/")
1822        arcname = arcname.lstrip("/")
1823
1824        # Now, fill the TarInfo object with
1825        # information specific for the file.
1826        tarinfo = self.tarinfo()
1827        tarinfo.tarfile = self  # Not needed
1828
1829        # Use os.stat or os.lstat, depending on whether symlinks should be resolved.
1830        if fileobj is None:
1831            if not self.dereference:
1832                statres = os.lstat(name)
1833            else:
1834                statres = os.stat(name)
1835        else:
1836            statres = os.fstat(fileobj.fileno())
1837        linkname = ""
1838
1839        stmd = statres.st_mode
1840        if stat.S_ISREG(stmd):
1841            inode = (statres.st_ino, statres.st_dev)
1842            if not self.dereference and statres.st_nlink > 1 and \
1843                    inode in self.inodes and arcname != self.inodes[inode]:
1844                # Is it a hardlink to an already
1845                # archived file?
1846                type = LNKTYPE
1847                linkname = self.inodes[inode]
1848            else:
1849                # The inode is added only if it's valid.
1850                # For win32 it is always 0.
1851                type = REGTYPE
1852                if inode[0]:
1853                    self.inodes[inode] = arcname
1854        elif stat.S_ISDIR(stmd):
1855            type = DIRTYPE
1856        elif stat.S_ISFIFO(stmd):
1857            type = FIFOTYPE
1858        elif stat.S_ISLNK(stmd):
1859            type = SYMTYPE
1860            linkname = os.readlink(name)
1861        elif stat.S_ISCHR(stmd):
1862            type = CHRTYPE
1863        elif stat.S_ISBLK(stmd):
1864            type = BLKTYPE
1865        else:
1866            return None
1867
1868        # Fill the TarInfo object with all
1869        # information we can get.
1870        tarinfo.name = arcname
1871        tarinfo.mode = stmd
1872        tarinfo.uid = statres.st_uid
1873        tarinfo.gid = statres.st_gid
1874        if type == REGTYPE:
1875            tarinfo.size = statres.st_size
1876        else:
1877            tarinfo.size = 0
1878        tarinfo.mtime = statres.st_mtime
1879        tarinfo.type = type
1880        tarinfo.linkname = linkname
1881        if pwd:
1882            try:
1883                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
1884            except KeyError:
1885                pass
1886        if grp:
1887            try:
1888                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
1889            except KeyError:
1890                pass
1891
1892        if type in (CHRTYPE, BLKTYPE):
1893            if hasattr(os, "major") and hasattr(os, "minor"):
1894                tarinfo.devmajor = os.major(statres.st_rdev)
1895                tarinfo.devminor = os.minor(statres.st_rdev)
1896        return tarinfo
1897
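    # gettarinfo() pairs with addfile(): stat an existing file, adjust the header
    # fields, then write it out. A minimal sketch with placeholder paths:
    #
    #   with tarfile.open("out.tar", "w") as tf:
    #       ti = tf.gettarinfo("data.bin", arcname="payload/data.bin")
    #       ti.uname = ti.gname = "root"            # tweak metadata before writing
    #       with open("data.bin", "rb") as f:
    #           tf.addfile(ti, f)
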
1898    def list(self, verbose=True, *, members=None):
1899        """Print a table of contents to sys.stdout. If `verbose' is False, only
1900           the names of the members are printed. If it is True, an `ls -l'-like
1901           output is produced. `members' is optional and must be a subset of the
1902           list returned by getmembers().
1903        """
1904        self._check()
1905
1906        if members is None:
1907            members = self
1908        for tarinfo in members:
1909            if verbose:
1910                _safe_print(stat.filemode(tarinfo.mode))
1911                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
1912                                       tarinfo.gname or tarinfo.gid))
1913                if tarinfo.ischr() or tarinfo.isblk():
1914                    _safe_print("%10s" %
1915                            ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
1916                else:
1917                    _safe_print("%10d" % tarinfo.size)
1918                _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
1919                            % time.localtime(tarinfo.mtime)[:6])
1920
1921            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))
1922
1923            if verbose:
1924                if tarinfo.issym():
1925                    _safe_print("-> " + tarinfo.linkname)
1926                if tarinfo.islnk():
1927                    _safe_print("link to " + tarinfo.linkname)
1928            print()
1929
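    # list() gives a `tar -tv`-style table of contents; a brief sketch:
    #
    #   with tarfile.open("example.tar") as tf:
    #       tf.list()                # ls -l style listing on sys.stdout
    #       tf.list(verbose=False)   # member names only
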
1930    def add(self, name, arcname=None, recursive=True, *, filter=None):
1931        """Add the file `name' to the archive. `name' may be any type of file
1932           (directory, fifo, symbolic link, etc.). If given, `arcname'
1933           specifies an alternative name for the file in the archive.
1934           Directories are added recursively by default. This can be avoided by
1935           setting `recursive' to False. `filter' is a function
1936           that expects a TarInfo object argument and returns the changed
1937           TarInfo object; if it returns None, the TarInfo object will be
1938           excluded from the archive.
1939        """
1940        self._check("awx")
1941
1942        if arcname is None:
1943            arcname = name
1944
1945        # Skip if somebody tries to archive the archive...
1946        if self.name is not None and os.path.abspath(name) == self.name:
1947            self._dbg(2, "tarfile: Skipped %r" % name)
1948            return
1949
1950        self._dbg(1, name)
1951
1952        # Create a TarInfo object from the file.
1953        tarinfo = self.gettarinfo(name, arcname)
1954
1955        if tarinfo is None:
1956            self._dbg(1, "tarfile: Unsupported type %r" % name)
1957            return
1958
1959        # Change or exclude the TarInfo object.
1960        if filter is not None:
1961            tarinfo = filter(tarinfo)
1962            if tarinfo is None:
1963                self._dbg(2, "tarfile: Excluded %r" % name)
1964                return
1965
1966        # Append the tar header and data to the archive.
1967        if tarinfo.isreg():
1968            with bltn_open(name, "rb") as f:
1969                self.addfile(tarinfo, f)
1970
1971        elif tarinfo.isdir():
1972            self.addfile(tarinfo)
1973            if recursive:
1974                for f in sorted(os.listdir(name)):
1975                    self.add(os.path.join(name, f), os.path.join(arcname, f),
1976                            recursive, filter=filter)
1977
1978        else:
1979            self.addfile(tarinfo)
1980
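    # A sketch of add() with a filter callback that drops *.pyc files and
    # normalizes ownership ("project" is a placeholder directory):
    #
    #   def scrub(ti):
    #       if ti.name.endswith(".pyc"):
    #           return None                  # member is excluded from the archive
    #       ti.uid = ti.gid = 0
    #       ti.uname = ti.gname = "root"
    #       return ti
    #
    #   with tarfile.open("project.tar.gz", "w:gz") as tf:
    #       tf.add("project", arcname="project", filter=scrub)
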
1981    def addfile(self, tarinfo, fileobj=None):
1982        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
1983           given, it should be a binary file, and tarinfo.size bytes are read
1984           from it and added to the archive. You can create TarInfo objects
1985           directly, or by using gettarinfo().
1986        """
1987        self._check("awx")
1988
1989        tarinfo = copy.copy(tarinfo)
1990
1991        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
1992        self.fileobj.write(buf)
1993        self.offset += len(buf)
1994        bufsize = self.copybufsize
1995        # If there's data to follow, append it.
1996        if fileobj is not None:
1997            copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
1998            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
1999            if remainder > 0:
2000                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2001                blocks += 1
2002            self.offset += blocks * BLOCKSIZE
2003
2004        self.members.append(tarinfo)
2005
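    # addfile() also accepts purely synthetic members; a sketch that archives an
    # in-memory payload without touching the filesystem:
    #
    #   import io, time, tarfile
    #   payload = b"hello world\n"
    #   ti = tarfile.TarInfo(name="greeting.txt")
    #   ti.size = len(payload)
    #   ti.mtime = time.time()
    #   with tarfile.open("greeting.tar", "w") as tf:
    #       tf.addfile(ti, io.BytesIO(payload))
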
2006    def extractall(self, path=".", members=None, *, numeric_owner=False):
2007        """Extract all members from the archive to the current working
2008           directory and set owner, modification time and permissions on
2009           directories afterwards. `path' specifies a different directory
2010           to extract to. `members' is optional and must be a subset of the
2011           list returned by getmembers(). If `numeric_owner` is True, only
2012           the numbers for user/group names are used and not the names.
2013        """
2014        directories = []
2015
2016        if members is None:
2017            members = self
2018
2019        for tarinfo in members:
2020            if tarinfo.isdir():
2021                # Extract directories with a safe mode.
2022                directories.append(tarinfo)
2023                tarinfo = copy.copy(tarinfo)
2024                tarinfo.mode = 0o700
2025            # Do not set_attrs directories, as we will do that further down
2026            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
2027                         numeric_owner=numeric_owner)
2028
2029        # Reverse sort directories.
2030        directories.sort(key=lambda a: a.name)
2031        directories.reverse()
2032
2033        # Set correct owner, mtime and filemode on directories.
2034        for tarinfo in directories:
2035            dirpath = os.path.join(path, tarinfo.name)
2036            try:
2037                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
2038                self.utime(tarinfo, dirpath)
2039                self.chmod(tarinfo, dirpath)
2040            except ExtractError as e:
2041                if self.errorlevel > 1:
2042                    raise
2043                else:
2044                    self._dbg(1, "tarfile: %s" % e)
2045
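    # An extractall() sketch. Because this version performs no path sanitizing,
    # never extract archives from untrusted sources without inspecting the member
    # names first; they may contain absolute paths or ".." components.
    #
    #   with tarfile.open("release.tar.xz") as tf:
    #       tf.extractall(path="/tmp/release", numeric_owner=True)
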
2046    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
2047        """Extract a member from the archive to the current working directory,
2048           using its full name. Its file information is extracted as accurately
2049           as possible. `member' may be a filename or a TarInfo object. You can
2050           specify a different directory using `path'. File attributes (owner,
2051           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
2052           is True, only the numbers for user/group names are used and not
2053           the names.
2054        """
2055        self._check("r")
2056
2057        if isinstance(member, str):
2058            tarinfo = self.getmember(member)
2059        else:
2060            tarinfo = member
2061
2062        # Prepare the link target for makelink().
2063        if tarinfo.islnk():
2064            tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2065
2066        try:
2067            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2068                                 set_attrs=set_attrs,
2069                                 numeric_owner=numeric_owner)
2070        except OSError as e:
2071            if self.errorlevel > 0:
2072                raise
2073            else:
2074                if e.filename is None:
2075                    self._dbg(1, "tarfile: %s" % e.strerror)
2076                else:
2077                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2078        except ExtractError as e:
2079            if self.errorlevel > 1:
2080                raise
2081            else:
2082                self._dbg(1, "tarfile: %s" % e)
2083
2084    def extractfile(self, member):
2085        """Extract a member from the archive as a file object. `member' may be
2086           a filename or a TarInfo object. If `member' is a regular file or a
2087           link, an io.BufferedReader object is returned. Otherwise, None is
2088           returned.
2089        """
2090        self._check("r")
2091
2092        if isinstance(member, str):
2093            tarinfo = self.getmember(member)
2094        else:
2095            tarinfo = member
2096
2097        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
2098            # Members with unknown types are treated as regular files.
2099            return self.fileobject(self, tarinfo)
2100
2101        elif tarinfo.islnk() or tarinfo.issym():
2102            if isinstance(self.fileobj, _Stream):
2103                # A small but ugly workaround for the case that someone tries
2104                # to extract a (sym)link as a file-object from a non-seekable
2105                # stream of tar blocks.
2106                raise StreamError("cannot extract (sym)link as file object")
2107            else:
2108                # A (sym)link's file object is its target's file object.
2109                return self.extractfile(self._find_link_target(tarinfo))
2110        else:
2111            # If there's no data associated with the member (directory, chrdev,
2112            # blkdev, etc.), return None instead of a file object.
2113            return None
2114
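    # An extractfile() sketch -- read a single member without writing it to disk
    # (names are placeholders):
    #
    #   with tarfile.open("example.tar") as tf:
    #       f = tf.extractfile("docs/readme.txt")   # None for directories, devices, ...
    #       if f is not None:
    #           data = f.read()
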
2115    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
2116                        numeric_owner=False):
2117        """Extract the TarInfo object tarinfo to a physical
2118           file called targetpath.
2119        """
2120        # Build the destination pathname for the given member,
2121        # replacing forward slashes with platform-specific
2122        # separators.
2123        targetpath = targetpath.rstrip("/")
2124        targetpath = targetpath.replace("/", os.sep)
2125
2126        # Create all upper directories.
2127        upperdirs = os.path.dirname(targetpath)
2128        if upperdirs and not os.path.exists(upperdirs):
2129            # Create directories that are not part of the archive with
2130            # default permissions.
2131            os.makedirs(upperdirs)
2132
2133        if tarinfo.islnk() or tarinfo.issym():
2134            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2135        else:
2136            self._dbg(1, tarinfo.name)
2137
2138        if tarinfo.isreg():
2139            self.makefile(tarinfo, targetpath)
2140        elif tarinfo.isdir():
2141            self.makedir(tarinfo, targetpath)
2142        elif tarinfo.isfifo():
2143            self.makefifo(tarinfo, targetpath)
2144        elif tarinfo.ischr() or tarinfo.isblk():
2145            self.makedev(tarinfo, targetpath)
2146        elif tarinfo.islnk() or tarinfo.issym():
2147            self.makelink(tarinfo, targetpath)
2148        elif tarinfo.type not in SUPPORTED_TYPES:
2149            self.makeunknown(tarinfo, targetpath)
2150        else:
2151            self.makefile(tarinfo, targetpath)
2152
2153        if set_attrs:
2154            self.chown(tarinfo, targetpath, numeric_owner)
2155            if not tarinfo.issym():
2156                self.chmod(tarinfo, targetpath)
2157                self.utime(tarinfo, targetpath)
2158
2159    #--------------------------------------------------------------------------
2160    # Below are the different file methods. They are called via
2161    # _extract_member() when extract() is called. They can be replaced in a
2162    # subclass to implement other functionality.
2163
2164    def makedir(self, tarinfo, targetpath):
2165        """Make a directory called targetpath.
2166        """
2167        try:
2168            # Use a safe mode for the directory, the real mode is set
2169            # later in _extract_member().
2170            os.mkdir(targetpath, 0o700)
2171        except FileExistsError:
2172            pass
2173
2174    def makefile(self, tarinfo, targetpath):
2175        """Make a file called targetpath.
2176        """
2177        source = self.fileobj
2178        source.seek(tarinfo.offset_data)
2179        bufsize = self.copybufsize
2180        with bltn_open(targetpath, "wb") as target:
2181            if tarinfo.sparse is not None:
2182                for offset, size in tarinfo.sparse:
2183                    target.seek(offset)
2184                    copyfileobj(source, target, size, ReadError, bufsize)
2185                target.seek(tarinfo.size)
2186                target.truncate()
2187            else:
2188                copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
2189
2190    def makeunknown(self, tarinfo, targetpath):
2191        """Make a file from a TarInfo object with an unknown type
2192           at targetpath.
2193        """
2194        self.makefile(tarinfo, targetpath)
2195        self._dbg(1, "tarfile: Unknown file type %r, " \
2196                     "extracted as regular file." % tarinfo.type)
2197
2198    def makefifo(self, tarinfo, targetpath):
2199        """Make a fifo called targetpath.
2200        """
2201        if hasattr(os, "mkfifo"):
2202            os.mkfifo(targetpath)
2203        else:
2204            raise ExtractError("fifo not supported by system")
2205
2206    def makedev(self, tarinfo, targetpath):
2207        """Make a character or block device called targetpath.
2208        """
2209        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2210            raise ExtractError("special devices not supported by system")
2211
2212        mode = tarinfo.mode
2213        if tarinfo.isblk():
2214            mode |= stat.S_IFBLK
2215        else:
2216            mode |= stat.S_IFCHR
2217
2218        os.mknod(targetpath, mode,
2219                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2220
2221    def makelink(self, tarinfo, targetpath):
2222        """Make a (symbolic) link called targetpath. If it cannot be created
2223           (platform limitation), we try to make a copy of the referenced file
2224           instead of a link.
2225        """
2226        try:
2227            # For systems that support symbolic and hard links.
2228            if tarinfo.issym():
2229                os.symlink(tarinfo.linkname, targetpath)
2230            else:
2231                # See extract().
2232                if os.path.exists(tarinfo._link_target):
2233                    os.link(tarinfo._link_target, targetpath)
2234                else:
2235                    self._extract_member(self._find_link_target(tarinfo),
2236                                         targetpath)
2237        except symlink_exception:
2238            try:
2239                self._extract_member(self._find_link_target(tarinfo),
2240                                     targetpath)
2241            except KeyError:
2242                raise ExtractError("unable to resolve link inside archive")
2243
2244    def chown(self, tarinfo, targetpath, numeric_owner):
2245        """Set owner of targetpath according to tarinfo. If numeric_owner
2246           is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
2247           is False, fall back to .gid/.uid when the search based on name
2248           fails.
2249        """
2250        if hasattr(os, "geteuid") and os.geteuid() == 0:
2251            # We have to be root to do so.
2252            g = tarinfo.gid
2253            u = tarinfo.uid
2254            if not numeric_owner:
2255                try:
2256                    if grp:
2257                        g = grp.getgrnam(tarinfo.gname)[2]
2258                except KeyError:
2259                    pass
2260                try:
2261                    if pwd:
2262                        u = pwd.getpwnam(tarinfo.uname)[2]
2263                except KeyError:
2264                    pass
2265            try:
2266                if tarinfo.issym() and hasattr(os, "lchown"):
2267                    os.lchown(targetpath, u, g)
2268                else:
2269                    os.chown(targetpath, u, g)
2270            except OSError:
2271                raise ExtractError("could not change owner")
2272
2273    def chmod(self, tarinfo, targetpath):
2274        """Set file permissions of targetpath according to tarinfo.
2275        """
2276        try:
2277            os.chmod(targetpath, tarinfo.mode)
2278        except OSError:
2279            raise ExtractError("could not change mode")
2280
2281    def utime(self, tarinfo, targetpath):
2282        """Set modification time of targetpath according to tarinfo.
2283        """
2284        if not hasattr(os, 'utime'):
2285            return
2286        try:
2287            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2288        except OSError:
2289            raise ExtractError("could not change modification time")
2290
2291    #--------------------------------------------------------------------------
2292    def next(self):
2293        """Return the next member of the archive as a TarInfo object, when
2294           TarFile is opened for reading. Return None if there are no more
2295           members available.
2296        """
2297        self._check("ra")
2298        if self.firstmember is not None:
2299            m = self.firstmember
2300            self.firstmember = None
2301            return m
2302
2303        # Advance the file pointer.
2304        if self.offset != self.fileobj.tell():
2305            self.fileobj.seek(self.offset - 1)
2306            if not self.fileobj.read(1):
2307                raise ReadError("unexpected end of data")
2308
2309        # Read the next block.
2310        tarinfo = None
2311        while True:
2312            try:
2313                tarinfo = self.tarinfo.fromtarfile(self)
2314            except EOFHeaderError as e:
2315                if self.ignore_zeros:
2316                    self._dbg(2, "0x%X: %s" % (self.offset, e))
2317                    self.offset += BLOCKSIZE
2318                    continue
2319            except InvalidHeaderError as e:
2320                if self.ignore_zeros:
2321                    self._dbg(2, "0x%X: %s" % (self.offset, e))
2322                    self.offset += BLOCKSIZE
2323                    continue
2324                elif self.offset == 0:
2325                    raise ReadError(str(e))
2326            except EmptyHeaderError:
2327                if self.offset == 0:
2328                    raise ReadError("empty file")
2329            except TruncatedHeaderError as e:
2330                if self.offset == 0:
2331                    raise ReadError(str(e))
2332            except SubsequentHeaderError as e:
2333                raise ReadError(str(e))
2334            break
2335
2336        if tarinfo is not None:
2337            self.members.append(tarinfo)
2338        else:
2339            self._loaded = True
2340
2341        return tarinfo
2342
2343    #--------------------------------------------------------------------------
2344    # Little helper methods:
2345
2346    def _getmember(self, name, tarinfo=None, normalize=False):
2347        """Find an archive member by name from bottom to top.
2348           If tarinfo is given, it is used as the starting point.
2349        """
2350        # Ensure that all members have been loaded.
2351        members = self.getmembers()
2352
2353        # Limit the member search list up to tarinfo.
2354        if tarinfo is not None:
2355            members = members[:members.index(tarinfo)]
2356
2357        if normalize:
2358            name = os.path.normpath(name)
2359
2360        for member in reversed(members):
2361            if normalize:
2362                member_name = os.path.normpath(member.name)
2363            else:
2364                member_name = member.name
2365
2366            if name == member_name:
2367                return member
2368
2369    def _load(self):
2370        """Read through the entire archive file and look for readable
2371           members.
2372        """
2373        while True:
2374            tarinfo = self.next()
2375            if tarinfo is None:
2376                break
2377        self._loaded = True
2378
2379    def _check(self, mode=None):
2380        """Check if TarFile is still open, and if the operation's mode
2381           corresponds to TarFile's mode.
2382        """
2383        if self.closed:
2384            raise OSError("%s is closed" % self.__class__.__name__)
2385        if mode is not None and self.mode not in mode:
2386            raise OSError("bad operation for mode %r" % self.mode)
2387
2388    def _find_link_target(self, tarinfo):
2389        """Find the target member of a symlink or hardlink member in the
2390           archive.
2391        """
2392        if tarinfo.issym():
2393            # Always search the entire archive.
2394            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2395            limit = None
2396        else:
2397            # Search the archive before the link, because a hard link is
2398            # just a reference to an already archived file.
2399            linkname = tarinfo.linkname
2400            limit = tarinfo
2401
2402        member = self._getmember(linkname, tarinfo=limit, normalize=True)
2403        if member is None:
2404            raise KeyError("linkname %r not found" % linkname)
2405        return member
2406
2407    def __iter__(self):
2408        """Provide an iterator object.
2409        """
2410        if self._loaded:
2411            yield from self.members
2412            return
2413
2414        # Yield items using TarFile's next() method.
2415        # When all members have been read, set TarFile as _loaded.
2416        index = 0
2417        # Fix for SF #1100429: Under rare circumstances it can
2418        # happen that getmembers() is called during iteration,
2419        # which will have already exhausted the next() method.
2420        if self.firstmember is not None:
2421            tarinfo = self.next()
2422            index += 1
2423            yield tarinfo
2424
2425        while True:
2426            if index < len(self.members):
2427                tarinfo = self.members[index]
2428            elif not self._loaded:
2429                tarinfo = self.next()
2430                if not tarinfo:
2431                    self._loaded = True
2432                    return
2433            else:
2434                return
2435            index += 1
2436            yield tarinfo
2437
2438    def _dbg(self, level, msg):
2439        """Write debugging output to sys.stderr.
2440        """
2441        if level <= self.debug:
2442            print(msg, file=sys.stderr)
2443
2444    def __enter__(self):
2445        self._check()
2446        return self
2447
2448    def __exit__(self, type, value, traceback):
2449        if type is None:
2450            self.close()
2451        else:
2452            # An exception occurred. We must not call close() because
2453            # it would try to write end-of-archive blocks and padding.
2454            if not self._extfileobj:
2455                self.fileobj.close()
2456            self.closed = True
2457
2458#--------------------
2459# exported functions
2460#--------------------
2461def is_tarfile(name):
2462    """Return True if name points to a tar archive that we
2463       are able to handle, else return False.
2464    """
2465    try:
2466        t = open(name)
2467        t.close()
2468        return True
2469    except TarError:
2470        return False
2471
2472open = TarFile.open
2473
2474
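# The module-level helpers in a nutshell (the archive name is a placeholder):
#
#   import tarfile
#   if tarfile.is_tarfile("maybe.tar"):
#       with tarfile.open("maybe.tar") as tf:   # tarfile.open is TarFile.open
#           tf.list(verbose=False)
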
2475def main():
2476    import argparse
2477
2478    description = 'A simple command-line interface for tarfile module.'
2479    parser = argparse.ArgumentParser(description=description)
2480    parser.add_argument('-v', '--verbose', action='store_true', default=False,
2481                        help='Verbose output')
2482    group = parser.add_mutually_exclusive_group(required=True)
2483    group.add_argument('-l', '--list', metavar='<tarfile>',
2484                       help='Show listing of a tarfile')
2485    group.add_argument('-e', '--extract', nargs='+',
2486                       metavar=('<tarfile>', '<output_dir>'),
2487                       help='Extract tarfile into target dir')
2488    group.add_argument('-c', '--create', nargs='+',
2489                       metavar=('<name>', '<file>'),
2490                       help='Create tarfile from sources')
2491    group.add_argument('-t', '--test', metavar='<tarfile>',
2492                       help='Test if a tarfile is valid')
2493    args = parser.parse_args()
2494
2495    if args.test is not None:
2496        src = args.test
2497        if is_tarfile(src):
2498            with open(src, 'r') as tar:
2499                tar.getmembers()
2500                print(tar.getmembers(), file=sys.stderr)
2501            if args.verbose:
2502                print('{!r} is a tar archive.'.format(src))
2503        else:
2504            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
2505
2506    elif args.list is not None:
2507        src = args.list
2508        if is_tarfile(src):
2509            with TarFile.open(src, 'r:*') as tf:
2510                tf.list(verbose=args.verbose)
2511        else:
2512            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
2513
2514    elif args.extract is not None:
2515        if len(args.extract) == 1:
2516            src = args.extract[0]
2517            curdir = os.curdir
2518        elif len(args.extract) == 2:
2519            src, curdir = args.extract
2520        else:
2521            parser.exit(1, parser.format_help())
2522
2523        if is_tarfile(src):
2524            with TarFile.open(src, 'r:*') as tf:
2525                tf.extractall(path=curdir)
2526            if args.verbose:
2527                if curdir == '.':
2528                    msg = '{!r} file is extracted.'.format(src)
2529                else:
2530                    msg = ('{!r} file is extracted '
2531                           'into {!r} directory.').format(src, curdir)
2532                print(msg)
2533        else:
2534            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
2535
2536    elif args.create is not None:
2537        tar_name = args.create.pop(0)
2538        _, ext = os.path.splitext(tar_name)
2539        compressions = {
2540            # gz
2541            '.gz': 'gz',
2542            '.tgz': 'gz',
2543            # xz
2544            '.xz': 'xz',
2545            '.txz': 'xz',
2546            # bz2
2547            '.bz2': 'bz2',
2548            '.tbz': 'bz2',
2549            '.tbz2': 'bz2',
2550            '.tb2': 'bz2',
2551        }
2552        tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w'
2553        tar_files = args.create
2554
2555        with TarFile.open(tar_name, tar_mode) as tf:
2556            for file_name in tar_files:
2557                tf.add(file_name)
2558
2559        if args.verbose:
2560            print('{!r} file created.'.format(tar_name))
2561
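# The command-line interface defined in main() can be run as "python -m tarfile".
# Rough usage, with placeholder archive names:
#
#   python -m tarfile -l archive.tar           # list contents
#   python -m tarfile -e archive.tar outdir    # extract into outdir
#   python -m tarfile -c new.tar.gz src        # create; compression chosen by suffix
#   python -m tarfile -t archive.tar           # test whether it is a valid archive
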
2562if __name__ == '__main__':
2563    main()
2564