• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2#-------------------------------------------------------------------
3# tarfile.py
4#-------------------------------------------------------------------
5# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
6# All rights reserved.
7#
8# Permission  is  hereby granted,  free  of charge,  to  any person
9# obtaining a  copy of  this software  and associated documentation
10# files  (the  "Software"),  to   deal  in  the  Software   without
11# restriction,  including  without limitation  the  rights to  use,
12# copy, modify, merge, publish, distribute, sublicense, and/or sell
13# copies  of  the  Software,  and to  permit  persons  to  whom the
14# Software  is  furnished  to  do  so,  subject  to  the  following
15# conditions:
16#
17# The above copyright  notice and this  permission notice shall  be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
21# EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
22# OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
23# NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
24# HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
25# WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
26# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27# OTHER DEALINGS IN THE SOFTWARE.
28#
29"""Read from and write to tar format archives.
30"""
31
# Module metadata.
version     = "0.9.0"
__author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."
35
36#---------
37# Imports
38#---------
39from builtins import open as bltn_open
40import sys
41import os
42import io
43import shutil
44import stat
45import time
46import struct
47import copy
48import re
49
50try:
51    import pwd
52except ImportError:
53    pwd = None
54try:
55    import grp
56except ImportError:
57    grp = None
58
# Exceptions that indicate the platform cannot create a symlink.
# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    # NOTE(review): OSError is a builtin and always defined, so this
    # NameError guard looks vestigial (presumably it once guarded a
    # platform-specific name such as WindowsError) -- confirm before removing.
    pass
67
# Public names exported by `from tarfile import *`.
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError",
           "CompressionError", "StreamError", "ExtractError", "HeaderError",
           "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT",
           "DEFAULT_FORMAT", "open"]
73
74#---------------------------------------------------------
75# tar constants
76#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string: "ustar\0" + version "00"

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

# Type flag bytes stored in the header's "typeflag" field.
REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

# Archive format selectors.
USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = PAX_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# Header text is encoded with the filesystem encoding, except on Windows
# where UTF-8 is used instead.
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()
153
154#---------------------------------------------------------
155# Some useful functions
156#---------------------------------------------------------
157
def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.

       The result is exactly `length` bytes: over-long encodings are
       truncated, short ones are padded with NUL bytes.
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, NUL)
163
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.

       Everything from the first NUL byte on is treated as padding
       and dropped.
    """
    head, _sep, _rest = s.partition(b"\0")
    return head.decode(encoding, errors)
171
def nti(s):
    """Convert a number field to a python number.
    """
    # There are two possible encodings for a number field, see
    # itn() below: ASCII octal digits, or GNU base-256 where the first
    # byte is 0o200 (positive) or 0o377 (negative).
    if s[0] in (0o200, 0o377):
        # Base-256: the remaining bytes are a big-endian integer.
        n = int.from_bytes(s[1:], "big")
        if s[0] == 0o377:
            # Negative values are stored in two's complement over
            # len(s) - 1 bytes.
            n -= 256 ** (len(s) - 1)
    else:
        try:
            s = nts(s, "ascii", "strict")
            n = int(s.strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n
191
def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.

       Returns a bytes object of exactly `digits` bytes.  Raises
       ValueError if n does not fit the field in the given format.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    n = int(n)
    if 0 <= n < 8 ** (digits - 1):
        # Plain octal representation, NUL-terminated.
        s = bytes("%0*o" % (digits - 1, n), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        # GNU base-256: marker byte followed by the low digits-1 bytes of
        # n, big-endian (two's complement for negative values).
        if n >= 0:
            marker = 0o200
        else:
            marker = 0o377
            n += 256 ** digits
        s = bytes([marker]) + (n & (256 ** (digits - 1) - 1)).to_bytes(
            digits - 1, "big")
    else:
        raise ValueError("overflow in number field")

    return s
220
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # Bytes 148..155 hold the chksum field itself; the constant 256
    # (= 8 * 0x20) stands in for the eight space characters that are
    # assumed to fill it.  The header is summed around that field.
    unsigned_chksum = 256 + sum(struct.unpack_from("148B", buf)) \
                          + sum(struct.unpack_from("356B", buf, 156))
    signed_chksum = 256 + sum(struct.unpack_from("148b", buf)) \
                        + sum(struct.unpack_from("356b", buf, 156))
    return unsigned_chksum, signed_chksum
233
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises `exception` if src runs out of data before `length`
       bytes have been transferred.
    """
    bufsize = bufsize or 16 * 1024
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, bufsize)
        return

    def _move(count):
        # Transfer exactly `count` bytes from src to dst, or fail.
        data = src.read(count)
        if len(data) < count:
            raise exception("unexpected end of data")
        dst.write(data)

    chunks, tail = divmod(length, bufsize)
    for _ in range(chunks):
        _move(bufsize)
    if tail != 0:
        _move(tail)
    return
258
259def _safe_print(s):
260    encoding = getattr(sys.stdout, 'encoding', None)
261    if encoding is not None:
262        s = s.encode(encoding, 'backslashreplace').decode(encoding)
263    print(s, end=' ')
264
265
class TarError(Exception):
    """Base exception."""


class ExtractError(TarError):
    """General exception for extract errors."""


class ReadError(TarError):
    """Exception for unreadable tar archives."""


class CompressionError(TarError):
    """Exception for unavailable compression methods."""


class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""


class HeaderError(TarError):
    """Base exception for header errors."""


class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""


class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""


class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""


class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""


class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
299
300#---------------------------
301# internal stream interface
302#---------------------------
303class _LowLevelFile:
304    """Low-level file object. Supports reading and writing.
305       It is used instead of a regular file object for streaming
306       access.
307    """
308
309    def __init__(self, name, mode):
310        mode = {
311            "r": os.O_RDONLY,
312            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
313        }[mode]
314        if hasattr(os, "O_BINARY"):
315            mode |= os.O_BINARY
316        self.fd = os.open(name, mode, 0o666)
317
318    def close(self):
319        os.close(self.fd)
320
321    def read(self, size):
322        return os.read(self.fd, size)
323
324    def write(self, s):
325        os.write(self.fd, s)
326
class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           name     -- archive name (also used for the gzip FNAME field)
           mode     -- "r" for reading, "w" for writing
           comptype -- "tar", "gz", "bz2", "xz", or "*" (detect on read)
           fileobj  -- existing stream-like object, or None to open `name`
           bufsize  -- block size for I/O on the underlying object
        """
        # If we open the file ourselves, we are responsible for closing it.
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name     = name or ""
        self.mode     = mode
        self.comptype = comptype
        self.fileobj  = fileobj
        self.bufsize  = bufsize
        self.buf      = b""     # raw (compressed) byte buffer
        self.pos      = 0       # position in the uncompressed stream
        self.closed   = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")  # running CRC of uncompressed data
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""  # decompressed byte buffer
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""  # decompressed byte buffer
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Roll back: never leave a half-initialized stream holding an
            # open file descriptor that we created ourselves.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        # hasattr() guards against __init__ having failed before
        # self.closed was ever assigned.
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        # Raw deflate stream (negative wbits): the gzip header and trailer
        # are written by hand here and in close().
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                            -self.zlib.MAX_WBITS,
                                            self.zlib.DEF_MEM_LEVEL,
                                            0)
        timestamp = struct.pack("<L", int(time.time()))
        # Gzip magic, deflate method, FNAME flag set, mtime, two extra bytes.
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # Honor "directory components removed" from RFC1952
        self.name = os.path.basename(self.name)
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        # Accumulate and flush in fixed-size blocks; a partial trailing
        # block stays buffered until close().
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    # Gzip trailer: CRC32 and uncompressed size (ISIZE),
                    # both little-endian (see RFC1952).
                    self.fileobj.write(struct.pack("<L", self.crc))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)  # skip mtime (4 bytes), XFL, OS

        if flag & 4:
            # FEXTRA: skip the extra field.
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            # NOTE(review): this uses self.read() (which advances self.pos)
            # while the rest of the header parsing uses self.__read() --
            # confirm whether that is intentional.
            self.read(xlen)
        if flag & 8:
            # FNAME: skip the zero-terminated original file name.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            # FCOMMENT: skip the zero-terminated comment.
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)  # FHCRC: skip the header CRC16

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos - self.pos >= 0:
            # Emulate a forward seek by reading and discarding.
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size):
        """Return the next size number of bytes from the stream."""
        assert size is not None
        buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        # Decompress until at least `size` bytes are available or EOF.
        c = len(self.dbuf)
        t = [self.dbuf]
        while c < size:
            # Skip underlying buffer to avoid unaligned double buffering.
            if self.buf:
                buf = self.buf
                self.buf = b""
            else:
                buf = self.fileobj.read(self.bufsize)
                if not buf:
                    break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.dbuf = t[size:]  # keep any surplus for the next call
        return t[:size]

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        t = [self.buf]
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            t.append(buf)
            c += len(buf)
        t = b"".join(t)
        self.buf = t[size:]  # keep any surplus for the next call
        return t[:size]
567# class _Stream
568
569class _StreamProxy(object):
570    """Small proxy class that enables transparent compression
571       detection for the Stream interface (mode 'r|*').
572    """
573
574    def __init__(self, fileobj):
575        self.fileobj = fileobj
576        self.buf = self.fileobj.read(BLOCKSIZE)
577
578    def read(self, size):
579        self.read = self.fileobj.read
580        return self.buf
581
582    def getcomptype(self):
583        if self.buf.startswith(b"\x1f\x8b\x08"):
584            return "gz"
585        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
586            return "bz2"
587        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
588            return "xz"
589        else:
590            return "tar"
591
592    def close(self):
593        self.fileobj.close()
594# class StreamProxy
595
596#------------------------
597# Extraction file object
598#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        """fileobj   -- underlying file object
           offset    -- byte offset of the member's data within fileobj
           size      -- logical size of the member
           blockinfo -- list of (offset, size) data extents for sparse
                        members; None means one contiguous extent
        """
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0   # logical position within the member
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each entry is (is_data, logical_start, logical_stop, real_offset);
        # real_offset (a position within fileobj) is None for holes.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                # Hole between the previous extent and this one.
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            # Trailing hole up to the logical end of the member.
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.
        """
        # The position is clamped to [0, size] in every whence mode.
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry covering the current position; the index
            # wraps around so reads still work after seeking backwards.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                # Data extent: read the corresponding bytes from fileobj.
                self.fileobj.seek(offset + (self.position - start))
                b = self.fileobj.read(length)
                if len(b) != length:
                    raise ReadError("unexpected end of data")
                buf += b
            else:
                # Hole: sparse regions read back as NUL bytes.
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        # Fill the writable buffer b; return the number of bytes stored.
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        self.closed = True
701#class _FileInFile
702
class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object exposing one archive member's data."""

    def __init__(self, tarfile, tarinfo):
        raw = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                          tarinfo.size, tarinfo.sparse)
        super().__init__(raw)
709#class ExFileObject
710
711#------------------
712# Exported Classes
713#------------------
714class TarInfo(object):
715    """Informational class which holds the details about an
716       archive member given by a tar header block.
717       TarInfo objects are returned by TarFile.getmember(),
718       TarFile.getmembers() and TarFile.gettarinfo() and are
719       usually created internally.
720    """
721
    # __slots__ avoids a per-instance __dict__ for these frequently created
    # objects.  The dict values appear to serve as per-attribute
    # documentation -- presumably picked up as the slot descriptors'
    # docstrings; confirm against the Python data-model docs.
    __slots__ = dict(
        name = 'Name of the archive member.',
        mode = 'Permission bits.',
        uid = 'User ID of the user who originally stored this member.',
        gid = 'Group ID of the user who originally stored this member.',
        size = 'Size in bytes.',
        mtime = 'Time of last modification.',
        chksum = 'Header checksum.',
        type = ('File type. type is usually one of these constants: '
                'REGTYPE, AREGTYPE, LNKTYPE, SYMTYPE, DIRTYPE, FIFOTYPE, '
                'CONTTYPE, CHRTYPE, BLKTYPE, GNUTYPE_SPARSE.'),
        linkname = ('Name of the target file name, which is only present '
                    'in TarInfo objects of type LNKTYPE and SYMTYPE.'),
        uname = 'User name.',
        gname = 'Group name.',
        devmajor = 'Device major number.',
        devminor = 'Device minor number.',
        offset = 'The tar header starts here.',
        offset_data = "The file's data starts here.",
        pax_headers = ('A dictionary containing key-value pairs of an '
                       'associated pax extended header.'),
        sparse = 'Sparse member information.',
        tarfile = None,
        _sparse_structs = None,
        _link_target = None,
        )
748
749    def __init__(self, name=""):
750        """Construct a TarInfo object. name is the optional name
751           of the member.
752        """
753        self.name = name        # member name
754        self.mode = 0o644       # file permissions
755        self.uid = 0            # user id
756        self.gid = 0            # group id
757        self.size = 0           # file size
758        self.mtime = 0          # modification time
759        self.chksum = 0         # header checksum
760        self.type = REGTYPE     # member type
761        self.linkname = ""      # link name
762        self.uname = ""         # user name
763        self.gname = ""         # group name
764        self.devmajor = 0       # device major number
765        self.devminor = 0       # device minor number
766
767        self.offset = 0         # the tar header starts here
768        self.offset_data = 0    # the file's data starts here
769
770        self.sparse = None      # sparse member information
771        self.pax_headers = {}   # pax header information
772
    @property
    def path(self):
        'In pax headers, "name" is called "path".'
        return self.name

    @path.setter
    def path(self, name):
        # Assigning to path writes through to name.
        self.name = name

    @property
    def linkpath(self):
        'In pax headers, "linkname" is called "linkpath".'
        return self.linkname

    @linkpath.setter
    def linkpath(self, linkname):
        # Assigning to linkpath writes through to linkname.
        self.linkname = linkname
790
791    def __repr__(self):
792        return "<%s %r at %#x>" % (self.__class__.__name__,self.name,id(self))
793
794    def get_info(self):
795        """Return the TarInfo's attributes as a dictionary.
796        """
797        info = {
798            "name":     self.name,
799            "mode":     self.mode & 0o7777,
800            "uid":      self.uid,
801            "gid":      self.gid,
802            "size":     self.size,
803            "mtime":    self.mtime,
804            "chksum":   self.chksum,
805            "type":     self.type,
806            "linkname": self.linkname,
807            "uname":    self.uname,
808            "gname":    self.gname,
809            "devmajor": self.devmajor,
810            "devminor": self.devminor
811        }
812
813        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
814            info["name"] += "/"
815
816        return info
817
818    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
819        """Return a tar header as a string of 512 byte blocks.
820        """
821        info = self.get_info()
822
823        if format == USTAR_FORMAT:
824            return self.create_ustar_header(info, encoding, errors)
825        elif format == GNU_FORMAT:
826            return self.create_gnu_header(info, encoding, errors)
827        elif format == PAX_FORMAT:
828            return self.create_pax_header(info, encoding)
829        else:
830            raise ValueError("invalid format")
831
832    def create_ustar_header(self, info, encoding, errors):
833        """Return the object as a ustar header block.
834        """
835        info["magic"] = POSIX_MAGIC
836
837        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
838            raise ValueError("linkname is too long")
839
840        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
841            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
842
843        return self._create_header(info, USTAR_FORMAT, encoding, errors)
844
845    def create_gnu_header(self, info, encoding, errors):
846        """Return the object as a GNU header block sequence.
847        """
848        info["magic"] = GNU_MAGIC
849
850        buf = b""
851        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
852            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
853
854        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
855            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
856
857        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
858
    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                # Non-ASCII value: it must travel in the pax header.
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                # Over-long value: it must travel in the pax header.
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        # The ustar block itself is ASCII-only; anything that did not fit
        # has been moved into the pax extended header above.
        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")
907
908    @classmethod
909    def create_pax_global_header(cls, pax_headers):
910        """Return the object as a pax global header block sequence.
911        """
912        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
913
914    def _posix_split_name(self, name, encoding, errors):
915        """Split a name longer than 100 chars into a prefix
916           and a name part.
917        """
918        components = name.split("/")
919        for i in range(1, len(components)):
920            prefix = "/".join(components[:i])
921            name = "/".join(components[i:])
922            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
923                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
924                break
925        else:
926            raise ValueError("name is too long")
927
928        return prefix, name
929
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        # Device major/minor numbers are only meaningful for character
        # and block devices; for all other types the fields stay empty.
        has_device_fields = info.get("type") in (CHRTYPE, BLKTYPE)
        if has_device_fields:
            devmajor = itn(info.get("devmajor", 0), 8, format)
            devminor = itn(info.get("devminor", 0), 8, format)
        else:
            devmajor = stn("", 8, encoding, errors)
            devminor = stn("", 8, encoding, errors)

        # The fixed field layout of a ustar header block.
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field, spaces while the sum is computed
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            devmajor,
            devminor,
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # struct.pack pads the joined fields with NULs up to BLOCKSIZE.
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Splice the checksum into its field at offset 148 (= 512 - 364):
        # "%06o\0" fills 7 of the 8 bytes; the byte at offset 155
        # (= 512 - 357) keeps the space written above.
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf
965
966    @staticmethod
967    def _create_payload(payload):
968        """Return the string payload filled with zero bytes
969           up to the next 512 byte border.
970        """
971        blocks, remainder = divmod(len(payload), BLOCKSIZE)
972        if remainder > 0:
973            payload += (BLOCKSIZE - remainder) * NUL
974        return payload
975
976    @classmethod
977    def _create_gnu_long_header(cls, name, type, encoding, errors):
978        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
979           for name.
980        """
981        name = name.encode(encoding, errors) + NUL
982
983        info = {}
984        info["name"] = "././@LongLink"
985        info["type"] = type
986        info["size"] = len(name)
987        info["magic"] = GNU_MAGIC
988
989        # create extended header + name blocks.
990        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
991                cls._create_payload(name)
992
    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            # The length field counts itself, so iterate until the total
            # record length reaches a fixed point (adding the digits of
            # the length may itself change the number of digits).
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)
1043
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.

           Raises EmptyHeaderError, TruncatedHeaderError, EOFHeaderError
           or InvalidHeaderError if buf is not a valid header block.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            # An all-NUL block marks the end of the archive.
            raise EOFHeaderError("end of file header")

        # The checksum field occupies bytes 148-156; calc_chksums()
        # returns several candidate sums to match against.
        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Decode the fixed-offset ustar header fields.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            # Each structure is a 12 byte offset plus a 12 byte length.
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            # Byte 482 flags additional sparse headers; bytes 483-495
            # hold the real (uncompacted) file size.
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj
1106
1107    @classmethod
1108    def fromtarfile(cls, tarfile):
1109        """Return the next TarInfo object from TarFile object
1110           tarfile.
1111        """
1112        buf = tarfile.fileobj.read(BLOCKSIZE)
1113        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
1114        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
1115        return obj._proc_member(tarfile)
1116
1117    #--------------------------------------------------------------------------
1118    # The following are methods that are called depending on the type of a
1119    # member. The entry point is _proc_member() which can be overridden in a
1120    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
1121    # implement the following
1122    # operations:
1123    # 1. Set self.offset_data to the position where the data blocks begin,
1124    #    if there is data that follows.
1125    # 2. Set tarfile.offset to the position where the next member's header will
1126    #    begin.
1127    # 3. Return self or another valid TarInfo object.
1128    def _proc_member(self, tarfile):
1129        """Choose the right processing method depending on
1130           the type and call it.
1131        """
1132        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
1133            return self._proc_gnulong(tarfile)
1134        elif self.type == GNUTYPE_SPARSE:
1135            return self._proc_sparse(tarfile)
1136        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
1137            return self._proc_pax(tarfile)
1138        else:
1139            return self._proc_builtin(tarfile)
1140
1141    def _proc_builtin(self, tarfile):
1142        """Process a builtin type or an unknown type which
1143           will be treated as a regular file.
1144        """
1145        self.offset_data = tarfile.fileobj.tell()
1146        offset = self.offset_data
1147        if self.isreg() or self.type not in SUPPORTED_TYPES:
1148            # Skip the following data blocks.
1149            offset += self._block(self.size)
1150        tarfile.offset = offset
1151
1152        # Patch the TarInfo object with saved global
1153        # header information.
1154        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)
1155
1156        return self
1157
1158    def _proc_gnulong(self, tarfile):
1159        """Process the blocks that hold a GNU longname
1160           or longlink member.
1161        """
1162        buf = tarfile.fileobj.read(self._block(self.size))
1163
1164        # Fetch the next header and process it.
1165        try:
1166            next = self.fromtarfile(tarfile)
1167        except HeaderError:
1168            raise SubsequentHeaderError("missing or bad subsequent header")
1169
1170        # Patch the TarInfo object from the next header with
1171        # the longname information.
1172        next.offset = self.offset
1173        if self.type == GNUTYPE_LONGNAME:
1174            next.name = nts(buf, tarfile.encoding, tarfile.errors)
1175        elif self.type == GNUTYPE_LONGLINK:
1176            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)
1177
1178        return next
1179
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extended block holds up to 21 structures of 24 bytes
            # each (a 12 byte offset plus a 12 byte length).
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags whether yet another extended block follows.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # Expose the real file size (from the sparse header) instead of
        # the compacted on-disk size used for skipping blocks above.
        self.size = origsize
        return self
1207
    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            # Global records are merged into the archive-wide dictionary
            # in place, so they affect every subsequent member.
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            if length == 0:
                # A zero length would leave pos unchanged and loop forever.
                raise InvalidHeaderError("invalid header")
            # The value spans from just past "keyword=" up to (but not
            # including) the record's trailing newline.
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next
1311
1312    def _proc_gnusparse_00(self, next, pax_headers, buf):
1313        """Process a GNU tar extended sparse header, version 0.0.
1314        """
1315        offsets = []
1316        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
1317            offsets.append(int(match.group(1)))
1318        numbytes = []
1319        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
1320            numbytes.append(int(match.group(1)))
1321        next.sparse = list(zip(offsets, numbytes))
1322
1323    def _proc_gnusparse_01(self, next, pax_headers):
1324        """Process a GNU tar extended sparse header, version 0.1.
1325        """
1326        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
1327        next.sparse = list(zip(sparse[::2], sparse[1::2]))
1328
    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

           The sparse map lives in the data section as newline-terminated
           decimal numbers: first the count of (offset, numbytes) pairs,
           then the pair values themselves.
        """
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        # The first line holds the number of sparse entries.
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            # Pull in more blocks until a complete number is buffered.
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        # The member's data begins right after the map blocks just read.
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))
1344
1345    def _apply_pax_info(self, pax_headers, encoding, errors):
1346        """Replace fields with supplemental information from a previous
1347           pax extended or global header.
1348        """
1349        for keyword, value in pax_headers.items():
1350            if keyword == "GNU.sparse.name":
1351                setattr(self, "path", value)
1352            elif keyword == "GNU.sparse.size":
1353                setattr(self, "size", int(value))
1354            elif keyword == "GNU.sparse.realsize":
1355                setattr(self, "size", int(value))
1356            elif keyword in PAX_FIELDS:
1357                if keyword in PAX_NUMBER_FIELDS:
1358                    try:
1359                        value = PAX_NUMBER_FIELDS[keyword](value)
1360                    except ValueError:
1361                        value = 0
1362                if keyword == "path":
1363                    value = value.rstrip("/")
1364                setattr(self, keyword, value)
1365
1366        self.pax_headers = pax_headers.copy()
1367
1368    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
1369        """Decode a single field from a pax record.
1370        """
1371        try:
1372            return value.decode(encoding, "strict")
1373        except UnicodeDecodeError:
1374            return value.decode(fallback_encoding, fallback_errors)
1375
1376    def _block(self, count):
1377        """Round up a byte count by BLOCKSIZE and return it,
1378           e.g. _block(834) => 1024.
1379        """
1380        blocks, remainder = divmod(count, BLOCKSIZE)
1381        if remainder:
1382            blocks += 1
1383        return blocks * BLOCKSIZE
1384
1385    def isreg(self):
1386        'Return True if the Tarinfo object is a regular file.'
1387        return self.type in REGULAR_TYPES
1388
1389    def isfile(self):
1390        'Return True if the Tarinfo object is a regular file.'
1391        return self.isreg()
1392
1393    def isdir(self):
1394        'Return True if it is a directory.'
1395        return self.type == DIRTYPE
1396
1397    def issym(self):
1398        'Return True if it is a symbolic link.'
1399        return self.type == SYMTYPE
1400
1401    def islnk(self):
1402        'Return True if it is a hard link.'
1403        return self.type == LNKTYPE
1404
1405    def ischr(self):
1406        'Return True if it is a character device.'
1407        return self.type == CHRTYPE
1408
1409    def isblk(self):
1410        'Return True if it is a block device.'
1411        return self.type == BLKTYPE
1412
1413    def isfifo(self):
1414        'Return True if it is a FIFO.'
1415        return self.type == FIFOTYPE
1416
1417    def issparse(self):
1418        return self.sparse is not None
1419
1420    def isdev(self):
1421        'Return True if it is one of character device, block device or FIFO.'
1422        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
1423# class TarInfo
1424
1425class TarFile(object):
1426    """The TarFile Class provides an interface to tar archives.
1427    """
1428
    debug = 0                   # Verbosity level, from 0 (no messages)
                                # up to 3 (all messages).

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object class used by extractfile().
1450
    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        # Map the archive mode onto the binary file mode used for open().
        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
        if mode not in modes:
            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
        self.mode = mode
        self._mode = modes[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            # A caller-supplied file object: adopt its name/mode where
            # possible, and remember not to close it ourselves.
            if (name is None and hasattr(fileobj, "name") and
                isinstance(fileobj.name, (str, bytes))):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes.
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # User-supplied pax headers only apply when writing PAX_FORMAT.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.copybufsize = copybufsize
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                # Read the first member eagerly so header errors surface
                # here rather than on the first next() call.
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in ("a", "w", "x"):
                self._loaded = True

                # Write pending pax global records at the archive start.
                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # On any setup failure, release the file (if we opened it)
            # and mark the object closed before re-raising.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise
1550
1551    #--------------------------------------------------------------------------
1552    # Below are the classmethods which act as alternate constructors to the
1553    # TarFile class. The open() method is the only one that is needed for
1554    # public use; it is the "super"-constructor and is able to select an
1555    # adequate "sub"-constructor for a particular compression using the mapping
1556    # from OPEN_METH.
1557    #
1558    # This concept allows one to subclass TarFile without losing the comfort of
1559    # the super-constructor. A sub-constructor is registered and made available
1560    # by adding it to the mapping in OPEN_METH.
1561
    @classmethod
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
           'r' or 'r:*' open for reading with transparent compression
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
           'r:xz'       open for reading with lzma compression
           'a' or 'a:'  open for appending, creating the file if necessary
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
           'w:xz'       open for writing with lzma compression

           'x' or 'x:'  create a tarfile exclusively without compression, raise
                        an exception if the file is already created
           'x:gz'       create a gzip compressed tarfile, raise an exception
                        if the file is already created
           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
                        if the file is already created
           'x:xz'       create an lzma compressed tarfile, raise an exception
                        if the file is already created

           'r|*'        open a stream of tar blocks with transparent compression
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
           'r|xz'       open an lzma compressed stream of tar blocks
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
           'w|xz'       open an lzma compressed stream for writing
        """

        if not name and not fileobj:
            raise ValueError("nothing to open")

        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            def not_compressed(comptype):
                return cls.OPEN_METH[comptype] == 'taropen'
            # Sort so that compressed openers are tried before plain
            # 'taropen' (False sorts before True).
            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
                func = getattr(cls, cls.OPEN_METH[comptype])
                if fileobj is not None:
                    saved_pos = fileobj.tell()
                try:
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError):
                    # Rewind so the next opener sees the stream start.
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
                    continue
            raise ReadError("file could not be opened successfully")

        elif ":" in mode:
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
                raise CompressionError("unknown compression type %r" % comptype)
            return func(name, filemode, fileobj, **kwargs)

        elif "|" in mode:
            # Non-seekable stream modes.
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in ("r", "w"):
                raise ValueError("mode must be 'r' or 'w'")

            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
            # The TarFile owns the stream and must close it.
            t._extfileobj = False
            return t

        elif mode in ("a", "w", "x"):
            return cls.taropen(name, mode, fileobj, **kwargs)

        raise ValueError("undiscernible mode")
1652
1653    @classmethod
1654    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1655        """Open uncompressed tar archive name for reading or writing.
1656        """
1657        if mode not in ("r", "a", "w", "x"):
1658            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1659        return cls(name, mode, fileobj, **kwargs)
1660
    @classmethod
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.

           `compresslevel' is passed straight to GzipFile (9 means best
           compression). Raises CompressionError when the gzip module is
           unavailable, and ReadError when `name' is not gzip data.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from gzip import GzipFile
        except ImportError:
            raise CompressionError("gzip module is not available")

        try:
            fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
        except OSError:
            # With an external fileobj in read mode, an OSError here is
            # reported as "not a gzip file" rather than propagated.
            if fileobj is not None and mode == 'r':
                raise ReadError("not a gzip file")
            raise

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except OSError:
            # Close the GzipFile wrapper we created before re-raising.
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a gzip file")
            raise
        except:
            fileobj.close()
            raise
        # We own the GzipFile, so close() must close it too.
        t._extfileobj = False
        return t
1693
    @classmethod
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.

           `compresslevel' is passed straight to BZ2File. Raises
           CompressionError when the bz2 module is unavailable, and
           ReadError when `name' is not bzip2 data.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from bz2 import BZ2File
        except ImportError:
            raise CompressionError("bz2 module is not available")

        fileobj = BZ2File(fileobj or name, mode, compresslevel=compresslevel)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (OSError, EOFError):
            # Reading the first header failed to decompress: the input
            # is not bzip2 data.
            fileobj.close()
            if mode == 'r':
                raise ReadError("not a bzip2 file")
            raise
        except:
            # Close the BZ2File we created before re-raising.
            fileobj.close()
            raise
        # We own the BZ2File, so close() must close it too.
        t._extfileobj = False
        return t
1721
    @classmethod
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.

           `preset' is passed straight to LZMAFile. Raises
           CompressionError when the lzma module is unavailable, and
           ReadError when `name' is not lzma data.
        """
        if mode not in ("r", "w", "x"):
            raise ValueError("mode must be 'r', 'w' or 'x'")

        try:
            from lzma import LZMAFile, LZMAError
        except ImportError:
            raise CompressionError("lzma module is not available")

        fileobj = LZMAFile(fileobj or name, mode, preset=preset)

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (LZMAError, EOFError):
            # Reading the first header failed to decompress: the input
            # is not lzma data.
            fileobj.close()
            if mode == 'r':
                raise ReadError("not an lzma file")
            raise
        except:
            # Close the LZMAFile we created before re-raising.
            fileobj.close()
            raise
        # We own the LZMAFile, so close() must close it too.
        t._extfileobj = False
        return t
1749
    # All *open() methods are registered here.
    # Maps the compression suffix used in mode strings (e.g. the "gz" in
    # "r:gz") to the name of the classmethod that handles that format;
    # open() looks the handler up with getattr().
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
    }
1757
1758    #--------------------------------------------------------------------------
1759    # The public methods which TarFile provides:
1760
1761    def close(self):
1762        """Close the TarFile. In write-mode, two finishing zero blocks are
1763           appended to the archive.
1764        """
1765        if self.closed:
1766            return
1767
1768        self.closed = True
1769        try:
1770            if self.mode in ("a", "w", "x"):
1771                self.fileobj.write(NUL * (BLOCKSIZE * 2))
1772                self.offset += (BLOCKSIZE * 2)
1773                # fill up the end with zero-blocks
1774                # (like option -b20 for tar does)
1775                blocks, remainder = divmod(self.offset, RECORDSIZE)
1776                if remainder > 0:
1777                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
1778        finally:
1779            if not self._extfileobj:
1780                self.fileobj.close()
1781
1782    def getmember(self, name):
1783        """Return a TarInfo object for member `name'. If `name' can not be
1784           found in the archive, KeyError is raised. If a member occurs more
1785           than once in the archive, its last occurrence is assumed to be the
1786           most up-to-date version.
1787        """
1788        tarinfo = self._getmember(name)
1789        if tarinfo is None:
1790            raise KeyError("filename %r not found" % name)
1791        return tarinfo
1792
1793    def getmembers(self):
1794        """Return the members of the archive as a list of TarInfo objects. The
1795           list has the same order as the members in the archive.
1796        """
1797        self._check()
1798        if not self._loaded:    # if we want to obtain a list of
1799            self._load()        # all members, we first have to
1800                                # scan the whole archive.
1801        return self.members
1802
1803    def getnames(self):
1804        """Return the members of the archive as a list of their names. It has
1805           the same order as the list returned by getmembers().
1806        """
1807        return [tarinfo.name for tarinfo in self.getmembers()]
1808
    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object from the result of os.stat or equivalent
           on an existing file. The file is either named by `name', or
           specified as a file object `fileobj' with a file descriptor. If
           given, `arcname' specifies an alternative name for the file in the
           archive, otherwise, the name is taken from the 'name' attribute of
           'fileobj', or the 'name' argument. The name should be a text
           string.

           Returns None for file types that cannot be represented in a
           tar archive (e.g. sockets).
        """
        # Only valid while the archive is open for appending/writing.
        self._check("awx")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self  # Not needed

        # Use os.stat or os.lstat, depending on if symlinks shall be resolved.
        if fileobj is None:
            if not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file? Store it as a link member pointing at
                # the first occurrence instead of duplicating the data.
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Sockets and other exotic types have no tar representation.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            # Only regular files carry payload data in the archive.
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        # Resolve numeric ids to names where the platform supports it;
        # unknown ids simply keep their numeric form.
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            # Record major/minor device numbers for device nodes.
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo
1907
    def list(self, verbose=True, *, members=None):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        self._check()

        # Default to every member of the archive.
        if members is None:
            members = self
        for tarinfo in members:
            if verbose:
                _safe_print(stat.filemode(tarinfo.mode))
                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                       tarinfo.gname or tarinfo.gid))
                if tarinfo.ischr() or tarinfo.isblk():
                    # Device nodes show "major,minor" in the size column.
                    _safe_print("%10s" %
                            ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
                else:
                    _safe_print("%10d" % tarinfo.size)
                _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
                            % time.localtime(tarinfo.mtime)[:6])

            # Directories get a trailing slash appended to their name.
            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))

            if verbose:
                if tarinfo.issym():
                    _safe_print("-> " + tarinfo.linkname)
                if tarinfo.islnk():
                    _safe_print("link to " + tarinfo.linkname)
            print()
1939
1940    def add(self, name, arcname=None, recursive=True, *, filter=None):
1941        """Add the file `name' to the archive. `name' may be any type of file
1942           (directory, fifo, symbolic link, etc.). If given, `arcname'
1943           specifies an alternative name for the file in the archive.
1944           Directories are added recursively by default. This can be avoided by
1945           setting `recursive' to False. `filter' is a function
1946           that expects a TarInfo object argument and returns the changed
1947           TarInfo object, if it returns None the TarInfo object will be
1948           excluded from the archive.
1949        """
1950        self._check("awx")
1951
1952        if arcname is None:
1953            arcname = name
1954
1955        # Skip if somebody tries to archive the archive...
1956        if self.name is not None and os.path.abspath(name) == self.name:
1957            self._dbg(2, "tarfile: Skipped %r" % name)
1958            return
1959
1960        self._dbg(1, name)
1961
1962        # Create a TarInfo object from the file.
1963        tarinfo = self.gettarinfo(name, arcname)
1964
1965        if tarinfo is None:
1966            self._dbg(1, "tarfile: Unsupported type %r" % name)
1967            return
1968
1969        # Change or exclude the TarInfo object.
1970        if filter is not None:
1971            tarinfo = filter(tarinfo)
1972            if tarinfo is None:
1973                self._dbg(2, "tarfile: Excluded %r" % name)
1974                return
1975
1976        # Append the tar header and data to the archive.
1977        if tarinfo.isreg():
1978            with bltn_open(name, "rb") as f:
1979                self.addfile(tarinfo, f)
1980
1981        elif tarinfo.isdir():
1982            self.addfile(tarinfo)
1983            if recursive:
1984                for f in sorted(os.listdir(name)):
1985                    self.add(os.path.join(name, f), os.path.join(arcname, f),
1986                            recursive, filter=filter)
1987
1988        else:
1989            self.addfile(tarinfo)
1990
1991    def addfile(self, tarinfo, fileobj=None):
1992        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
1993           given, it should be a binary file, and tarinfo.size bytes are read
1994           from it and added to the archive. You can create TarInfo objects
1995           directly, or by using gettarinfo().
1996        """
1997        self._check("awx")
1998
1999        tarinfo = copy.copy(tarinfo)
2000
2001        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
2002        self.fileobj.write(buf)
2003        self.offset += len(buf)
2004        bufsize=self.copybufsize
2005        # If there's data to follow, append it.
2006        if fileobj is not None:
2007            copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
2008            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
2009            if remainder > 0:
2010                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
2011                blocks += 1
2012            self.offset += blocks * BLOCKSIZE
2013
2014        self.members.append(tarinfo)
2015
    def extractall(self, path=".", members=None, *, numeric_owner=False):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers(). If `numeric_owner` is True, only
           the numbers for user/group names are used and not the names.
        """
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
                # Extract directories with a safe mode.
                directories.append(tarinfo)
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0o700
            # Do not set_attrs directories, as we will do that further down
            # (extracting children would clobber mtimes and may need write
            # permission on the directory).
            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
                         numeric_owner=numeric_owner)

        # Reverse sort directories, so that attributes are fixed up
        # deepest-first and parents are touched last.
        directories.sort(key=lambda a: a.name)
        directories.reverse()

        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
            dirpath = os.path.join(path, tarinfo.name)
            try:
                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
            except ExtractError as e:
                # errorlevel > 1 turns extraction problems into exceptions;
                # otherwise they are only reported via the debug channel.
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)
2055
    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
           is True, only the numbers for user/group names are used and not
           the names.
        """
        self._check("r")

        # Accept either a member name or a TarInfo object.
        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

        try:
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs,
                                 numeric_owner=numeric_owner)
        except OSError as e:
            # errorlevel > 0: OS-level failures propagate to the caller.
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
        except ExtractError as e:
            # errorlevel > 1: non-fatal extraction problems propagate too.
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)
2093
2094    def extractfile(self, member):
2095        """Extract a member from the archive as a file object. `member' may be
2096           a filename or a TarInfo object. If `member' is a regular file or
2097           a link, an io.BufferedReader object is returned. For all other
2098           existing members, None is returned. If `member' does not appear
2099           in the archive, KeyError is raised.
2100        """
2101        self._check("r")
2102
2103        if isinstance(member, str):
2104            tarinfo = self.getmember(member)
2105        else:
2106            tarinfo = member
2107
2108        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
2109            # Members with unknown types are treated as regular files.
2110            return self.fileobject(self, tarinfo)
2111
2112        elif tarinfo.islnk() or tarinfo.issym():
2113            if isinstance(self.fileobj, _Stream):
2114                # A small but ugly workaround for the case that someone tries
2115                # to extract a (sym)link as a file-object from a non-seekable
2116                # stream of tar blocks.
2117                raise StreamError("cannot extract (sym)link as file object")
2118            else:
2119                # A (sym)link's file object is its target's file object.
2120                return self.extractfile(self._find_link_target(tarinfo))
2121        else:
2122            # If there's no data associated with the member (directory, chrdev,
2123            # blkdev, etc.), return None instead of a file object.
2124            return None
2125
2126    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
2127                        numeric_owner=False):
2128        """Extract the TarInfo object tarinfo to a physical
2129           file called targetpath.
2130        """
2131        # Fetch the TarInfo object for the given name
2132        # and build the destination pathname, replacing
2133        # forward slashes to platform specific separators.
2134        targetpath = targetpath.rstrip("/")
2135        targetpath = targetpath.replace("/", os.sep)
2136
2137        # Create all upper directories.
2138        upperdirs = os.path.dirname(targetpath)
2139        if upperdirs and not os.path.exists(upperdirs):
2140            # Create directories that are not part of the archive with
2141            # default permissions.
2142            os.makedirs(upperdirs)
2143
2144        if tarinfo.islnk() or tarinfo.issym():
2145            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2146        else:
2147            self._dbg(1, tarinfo.name)
2148
2149        if tarinfo.isreg():
2150            self.makefile(tarinfo, targetpath)
2151        elif tarinfo.isdir():
2152            self.makedir(tarinfo, targetpath)
2153        elif tarinfo.isfifo():
2154            self.makefifo(tarinfo, targetpath)
2155        elif tarinfo.ischr() or tarinfo.isblk():
2156            self.makedev(tarinfo, targetpath)
2157        elif tarinfo.islnk() or tarinfo.issym():
2158            self.makelink(tarinfo, targetpath)
2159        elif tarinfo.type not in SUPPORTED_TYPES:
2160            self.makeunknown(tarinfo, targetpath)
2161        else:
2162            self.makefile(tarinfo, targetpath)
2163
2164        if set_attrs:
2165            self.chown(tarinfo, targetpath, numeric_owner)
2166            if not tarinfo.issym():
2167                self.chmod(tarinfo, targetpath)
2168                self.utime(tarinfo, targetpath)
2169
2170    #--------------------------------------------------------------------------
2171    # Below are the different file methods. They are called via
2172    # _extract_member() when extract() is called. They can be replaced in a
2173    # subclass to implement other functionality.
2174
2175    def makedir(self, tarinfo, targetpath):
2176        """Make a directory called targetpath.
2177        """
2178        try:
2179            # Use a safe mode for the directory, the real mode is set
2180            # later in _extract_member().
2181            os.mkdir(targetpath, 0o700)
2182        except FileExistsError:
2183            pass
2184
    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.fileobj
        # Position the archive file object at the member's data.
        source.seek(tarinfo.offset_data)
        bufsize = self.copybufsize
        with bltn_open(targetpath, "wb") as target:
            if tarinfo.sparse is not None:
                # Write each stored segment at its recorded offset; the
                # gaps in between remain zero-filled holes.
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size, ReadError, bufsize)
                # Ensure the file ends at its full logical size even if the
                # last segment stops short of it.
                target.seek(tarinfo.size)
                target.truncate()
            else:
                copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
2200
2201    def makeunknown(self, tarinfo, targetpath):
2202        """Make a file from a TarInfo object with an unknown type
2203           at targetpath.
2204        """
2205        self.makefile(tarinfo, targetpath)
2206        self._dbg(1, "tarfile: Unknown file type %r, " \
2207                     "extracted as regular file." % tarinfo.type)
2208
2209    def makefifo(self, tarinfo, targetpath):
2210        """Make a fifo called targetpath.
2211        """
2212        if hasattr(os, "mkfifo"):
2213            os.mkfifo(targetpath)
2214        else:
2215            raise ExtractError("fifo not supported by system")
2216
2217    def makedev(self, tarinfo, targetpath):
2218        """Make a character or block device called targetpath.
2219        """
2220        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2221            raise ExtractError("special devices not supported by system")
2222
2223        mode = tarinfo.mode
2224        if tarinfo.isblk():
2225            mode |= stat.S_IFBLK
2226        else:
2227            mode |= stat.S_IFCHR
2228
2229        os.mknod(targetpath, mode,
2230                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2231
    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
          (platform limitation), we try to make a copy of the referenced file
          instead of a link.
        """
        try:
            # For systems that support symbolic and hard links.
            if tarinfo.issym():
                if os.path.lexists(targetpath):
                    # Avoid FileExistsError on following os.symlink.
                    os.unlink(targetpath)
                os.symlink(tarinfo.linkname, targetpath)
            else:
                # See extract(): _link_target was prepared there.
                if os.path.exists(tarinfo._link_target):
                    os.link(tarinfo._link_target, targetpath)
                else:
                    # Target file is missing on disk; extract the archived
                    # target member directly to targetpath instead.
                    self._extract_member(self._find_link_target(tarinfo),
                                         targetpath)
        except symlink_exception:
            # Linking is unsupported here; fall back to extracting the
            # link's target member as a plain copy.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")
2257
    def chown(self, tarinfo, targetpath, numeric_owner):
        """Set owner of targetpath according to tarinfo. If numeric_owner
           is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
           is False, fall back to .gid/.uid when the search based on name
           fails.
        """
        if hasattr(os, "geteuid") and os.geteuid() == 0:
            # We have to be root to do so.
            g = tarinfo.gid
            u = tarinfo.uid
            if not numeric_owner:
                # Prefer the symbolic names; keep the numeric ids when a
                # name is unknown on this system.
                try:
                    if grp:
                        g = grp.getgrnam(tarinfo.gname)[2]
                except KeyError:
                    pass
                try:
                    if pwd:
                        u = pwd.getpwnam(tarinfo.uname)[2]
                except KeyError:
                    pass
            try:
                # lchown so that a symlink's own ownership is changed,
                # not its target's.
                if tarinfo.issym() and hasattr(os, "lchown"):
                    os.lchown(targetpath, u, g)
                else:
                    os.chown(targetpath, u, g)
            except OSError:
                raise ExtractError("could not change owner")
2286
2287    def chmod(self, tarinfo, targetpath):
2288        """Set file permissions of targetpath according to tarinfo.
2289        """
2290        try:
2291            os.chmod(targetpath, tarinfo.mode)
2292        except OSError:
2293            raise ExtractError("could not change mode")
2294
2295    def utime(self, tarinfo, targetpath):
2296        """Set modification time of targetpath according to tarinfo.
2297        """
2298        if not hasattr(os, 'utime'):
2299            return
2300        try:
2301            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2302        except OSError:
2303            raise ExtractError("could not change modification time")
2304
2305    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        # A member read ahead (e.g. by open()) is returned first.
        if self.firstmember is not None:
            m = self.firstmember
            self.firstmember = None
            return m

        # Advance the file pointer.
        if self.offset != self.fileobj.tell():
            # Seek to one byte before the next header and read it back to
            # detect a truncated archive early.
            self.fileobj.seek(self.offset - 1)
            if not self.fileobj.read(1):
                raise ReadError("unexpected end of data")

        # Read the next block.
        tarinfo = None
        while True:
            try:
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                # A zero block; with ignore_zeros we skip it and keep
                # scanning, otherwise it terminates the archive.
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # A bad very first header means this is not a tar file.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                raise ReadError(str(e))
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # End of archive reached; the member list is now complete.
            self._loaded = True

        return tarinfo
2356
2357    #--------------------------------------------------------------------------
2358    # Little helper methods:
2359
2360    def _getmember(self, name, tarinfo=None, normalize=False):
2361        """Find an archive member by name from bottom to top.
2362           If tarinfo is given, it is used as the starting point.
2363        """
2364        # Ensure that all members have been loaded.
2365        members = self.getmembers()
2366
2367        # Limit the member search list up to tarinfo.
2368        if tarinfo is not None:
2369            members = members[:members.index(tarinfo)]
2370
2371        if normalize:
2372            name = os.path.normpath(name)
2373
2374        for member in reversed(members):
2375            if normalize:
2376                member_name = os.path.normpath(member.name)
2377            else:
2378                member_name = member.name
2379
2380            if name == member_name:
2381                return member
2382
2383    def _load(self):
2384        """Read through the entire archive file and look for readable
2385           members.
2386        """
2387        while True:
2388            tarinfo = self.next()
2389            if tarinfo is None:
2390                break
2391        self._loaded = True
2392
2393    def _check(self, mode=None):
2394        """Check if TarFile is still open, and if the operation's mode
2395           corresponds to TarFile's mode.
2396        """
2397        if self.closed:
2398            raise OSError("%s is closed" % self.__class__.__name__)
2399        if mode is not None and self.mode not in mode:
2400            raise OSError("bad operation for mode %r" % self.mode)
2401
2402    def _find_link_target(self, tarinfo):
2403        """Find the target member of a symlink or hardlink member in the
2404           archive.
2405        """
2406        if tarinfo.issym():
2407            # Always search the entire archive.
2408            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2409            limit = None
2410        else:
2411            # Search the archive before the link, because a hard link is
2412            # just a reference to an already archived file.
2413            linkname = tarinfo.linkname
2414            limit = tarinfo
2415
2416        member = self._getmember(linkname, tarinfo=limit, normalize=True)
2417        if member is None:
2418            raise KeyError("linkname %r not found" % linkname)
2419        return member
2420
    def __iter__(self):
        """Provide an iterator object.

           Yields TarInfo members in archive order, reading further
           headers lazily via next() until the archive is exhausted.
        """
        if self._loaded:
            # The whole archive has been parsed already; replay the
            # cached member list.
            yield from self.members
            return

        # Yield items using TarFile's next() method.
        # When all members have been read, set TarFile as _loaded.
        index = 0
        # Fix for SF #1100429: Under rare circumstances it can
        # happen that getmembers() is called during iteration,
        # which will have already exhausted the next() method.
        if self.firstmember is not None:
            tarinfo = self.next()
            index += 1
            yield tarinfo

        while True:
            if index < len(self.members):
                # getmembers() may have run in the meantime and filled
                # self.members past our position; serve those first.
                tarinfo = self.members[index]
            elif not self._loaded:
                tarinfo = self.next()
                if not tarinfo:
                    self._loaded = True
                    return
            else:
                return
            index += 1
            yield tarinfo
2451
2452    def _dbg(self, level, msg):
2453        """Write debugging output to sys.stderr.
2454        """
2455        if level <= self.debug:
2456            print(msg, file=sys.stderr)
2457
2458    def __enter__(self):
2459        self._check()
2460        return self
2461
2462    def __exit__(self, type, value, traceback):
2463        if type is None:
2464            self.close()
2465        else:
2466            # An exception occurred. We must not call close() because
2467            # it would try to write end-of-archive blocks and padding.
2468            if not self._extfileobj:
2469                self.fileobj.close()
2470            self.closed = True
2471
2472#--------------------
2473# exported functions
2474#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.

       'name' should be a string, file, or file-like object.
    """
    try:
        if hasattr(name, "read"):
            # Probe the stream without consuming it: remember the
            # current position and rewind afterwards, so the caller can
            # still read the data (e.g. pass it to open(fileobj=...)).
            pos = name.tell()
            t = open(fileobj=name)
            name.seek(pos)
        else:
            t = open(name)
        t.close()
        return True
    except TarError:
        return False
2490
# Public module-level entry point: tarfile.open(...) forwards to
# TarFile.open().  This deliberately shadows the builtin open() inside
# this module; the builtin was saved above as bltn_open.
open = TarFile.open
2492
2493
def main():
    """Command-line interface for the tarfile module.

    Supports four mutually exclusive operations: --test, --list,
    --extract and --create, plus a -v/--verbose flag.  Exits with
    status 1 (via parser.exit) on invalid input.
    """
    import argparse

    description = 'A simple command-line interface for tarfile module.'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-v', '--verbose', action='store_true', default=False,
                        help='Verbose output')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-l', '--list', metavar='<tarfile>',
                       help='Show listing of a tarfile')
    group.add_argument('-e', '--extract', nargs='+',
                       metavar=('<tarfile>', '<output_dir>'),
                       help='Extract tarfile into target dir')
    group.add_argument('-c', '--create', nargs='+',
                       metavar=('<name>', '<file>'),
                       help='Create tarfile from sources')
    group.add_argument('-t', '--test', metavar='<tarfile>',
                       help='Test if a tarfile is valid')
    args = parser.parse_args()

    if args.test is not None:
        src = args.test
        if is_tarfile(src):
            with open(src, 'r') as tar:
                # Fetching the member list forces every header in the
                # archive to be parsed, which is the actual validity
                # test; one call suffices (was called twice before).
                print(tar.getmembers(), file=sys.stderr)
            if args.verbose:
                print('{!r} is a tar archive.'.format(src))
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.list is not None:
        src = args.list
        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.list(verbose=args.verbose)
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.extract is not None:
        # One positional argument means "extract into the current
        # directory"; two mean "extract into the given directory".
        if len(args.extract) == 1:
            src = args.extract[0]
            curdir = os.curdir
        elif len(args.extract) == 2:
            src, curdir = args.extract
        else:
            parser.exit(1, parser.format_help())

        if is_tarfile(src):
            with TarFile.open(src, 'r:*') as tf:
                tf.extractall(path=curdir)
            if args.verbose:
                if curdir == '.':
                    msg = '{!r} file is extracted.'.format(src)
                else:
                    msg = ('{!r} file is extracted '
                           'into {!r} directory.').format(src, curdir)
                print(msg)
        else:
            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))

    elif args.create is not None:
        tar_name = args.create.pop(0)
        _, ext = os.path.splitext(tar_name)
        # Pick the compression mode from the target file's extension;
        # unknown extensions produce an uncompressed archive.
        compressions = {
            # gz
            '.gz': 'gz',
            '.tgz': 'gz',
            # xz
            '.xz': 'xz',
            '.txz': 'xz',
            # bz2
            '.bz2': 'bz2',
            '.tbz': 'bz2',
            '.tbz2': 'bz2',
            '.tb2': 'bz2',
        }
        compression = compressions.get(ext)
        tar_mode = 'w:' + compression if compression else 'w'
        tar_files = args.create

        with TarFile.open(tar_name, tar_mode) as tf:
            for file_name in tar_files:
                tf.add(file_name)

        if args.verbose:
            print('{!r} file created.'.format(tar_name))
2580
# Allow running the module directly (or via "python -m tarfile") as a
# small command-line tool.
if __name__ == '__main__':
    main()
2583