#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
# All rights reserved.
#
# Permission  is  hereby granted,  free  of charge,  to  any person
# obtaining a  copy of  this software  and associated documentation
# files  (the  "Software"),  to   deal  in  the  Software   without
# restriction,  including  without limitation  the  rights to  use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies  of  the  Software,  and to  permit  persons  to  whom the
# Software  is  furnished  to  do  so,  subject  to  the  following
# conditions:
#
# The above copyright  notice and this  permission notice shall  be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
# EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
# OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
# NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
# HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
# WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
"""Read from and write to tar format archives.
"""

version     = "0.9.0"
__author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__    = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__   = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."

#---------
# Imports
#---------
from builtins import open as bltn_open
import sys
import os
import io
import shutil
import stat
import time
import struct
import copy
import re

try:
    import pwd
except ImportError:
    pwd = None
try:
    import grp
except ImportError:
    grp = None

# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # OSError (winerror=1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (OSError,)
except NameError:
    pass

# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError", "ReadError",
           "CompressionError", "StreamError", "ExtractError", "HeaderError",
           "ENCODING", "USTAR_FORMAT", "GNU_FORMAT", "PAX_FORMAT",
           "DEFAULT_FORMAT", "open"]

#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
if os.name == "nt":
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()

#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------

def stn(s, length, encoding, errors):
    """Convert a string to a null-terminated bytes object.
    """
    s = s.encode(encoding, errors)
    return s[:length] + (length - len(s)) * NUL

def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
    """
    p = s.find(b"\0")
    if p != -1:
        s = s[:p]
    return s.decode(encoding, errors)

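# For illustration, how the two helpers above round-trip a short field:
#   stn("abc", 5, "ascii", "strict")        -> b"abc\x00\x00"
#   nts(b"abc\x00\x00", "ascii", "strict")  -> "abc"
# stn() truncates values longer than `length' and pads shorter ones with NULs.
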
def nti(s):
    """Convert a number field to a python number.
    """
    # There are two possible encodings for a number field, see
    # itn() below.
    if s[0] in (0o200, 0o377):
        n = 0
        for i in range(len(s) - 1):
            n <<= 8
            n += s[i + 1]
        if s[0] == 0o377:
            n = -(256 ** (len(s) - 1) - n)
    else:
        try:
            s = nts(s, "ascii", "strict")
            n = int(s.strip() or "0", 8)
        except ValueError:
            raise InvalidHeaderError("invalid header")
    return n

def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicates this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        s = bytes("%0*o" % (digits - 1, int(n)), "ascii") + NUL
    elif format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        if n >= 0:
            s = bytearray([0o200])
        else:
            s = bytearray([0o377])
            n = 256 ** digits + n

        for i in range(digits - 1):
            s.insert(1, n & 0o377)
            n >>= 8
    else:
        raise ValueError("overflow in number field")

    return s

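# For illustration, the two encodings produced by itn() above:
#   itn(0o755, 8, USTAR_FORMAT) -> b"0000755\x00"   (octal digits + NUL)
#   itn(2**33, 12, GNU_FORMAT)  -> b"\x80" followed by 11 big-endian base-256
#                                  bytes, because 2**33 does not fit in 11
#                                  octal digits.
# nti() reverses both encodings, so nti(itn(n, d, GNU_FORMAT)) == n for every
# n in the representable range.
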
def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    unsigned_chksum = 256 + sum(struct.unpack_from("148B8x356B", buf))
    signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
    return unsigned_chksum, signed_chksum

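# Note on the struct formats above: "148B8x356B" sums bytes 0-147 and 156-511
# of the 512-byte header and skips the 8-byte chksum field at offsets 148-155;
# the added 256 stands in for that field as if it held eight ASCII spaces
# (8 * 0x20).
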
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.
    """
    bufsize = bufsize or 16 * 1024
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst, bufsize)
        return

    blocks, remainder = divmod(length, bufsize)
    for b in range(blocks):
        buf = src.read(bufsize)
        if len(buf) < bufsize:
            raise exception("unexpected end of data")
        dst.write(buf)

    if remainder != 0:
        buf = src.read(remainder)
        if len(buf) < remainder:
            raise exception("unexpected end of data")
        dst.write(buf)
    return

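# Usage note: copyfileobj(src, dst, 1024) copies exactly 1024 bytes and raises
# `exception' if src runs out early, which is how member payloads of a known
# size are transferred without silently truncating them.
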
def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    import warnings
    warnings.warn("deprecated in favor of stat.filemode",
                  DeprecationWarning, 2)
    return stat.filemode(mode)

def _safe_print(s):
    encoding = getattr(sys.stdout, 'encoding', None)
    if encoding is not None:
        s = s.encode(encoding, 'backslashreplace').decode(encoding)
    print(s, end=' ')


class TarError(Exception):
    """Base exception."""
    pass
class ExtractError(TarError):
    """General exception for extract errors."""
    pass
class ReadError(TarError):
    """Exception for unreadable tar archives."""
    pass
class CompressionError(TarError):
    """Exception for unavailable compression methods."""
    pass
class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""
    pass
class HeaderError(TarError):
    """Base exception for header errors."""
    pass
class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""
    pass
class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""
    pass
class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""
    pass
class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""
    pass
class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""
    pass

#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        mode = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        if hasattr(os, "O_BINARY"):
            mode |= os.O_BINARY
        self.fd = os.open(name, mode, 0o666)

    def close(self):
        os.close(self.fd)

    def read(self, size):
        return os.read(self.fd, size)

    def write(self, s):
        os.write(self.fd, s)

class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name     = name or ""
        self.mode     = mode
        self.comptype = comptype
        self.fileobj  = fileobj
        self.bufsize  = bufsize
        self.buf      = b""
        self.pos      = 0
        self.closed   = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = OSError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                            -self.zlib.MAX_WBITS,
                                            self.zlib.DEF_MEM_LEVEL,
                                            0)
        timestamp = struct.pack("<L", int(time.time()))
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        self.closed = True
        try:
            if self.mode == "w" and self.comptype != "tar":
                self.buf += self.cmp.flush()

            if self.mode == "w" and self.buf:
                self.fileobj.write(self.buf)
                self.buf = b""
                if self.comptype == "gz":
                    self.fileobj.write(struct.pack("<L", self.crc))
                    self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))
        finally:
            if not self._extfileobj:
                self.fileobj.close()

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

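        # For reference, the RFC 1952 FLG bits checked below:
        # 0x04 = FEXTRA, 0x08 = FNAME, 0x10 = FCOMMENT, 0x02 = FHCRC.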
        if flag & 4:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream

class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        if self.buf.startswith(b"\x1f\x8b\x08"):
            return "gz"
        elif self.buf[0:3] == b"BZh" and self.buf[4:10] == b"1AY&SY":
            return "bz2"
        elif self.buf.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        else:
            return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy

#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

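    # Illustration of the map built in __init__() above: each entry is
    # (is_data, member_start, member_stop, archive_position_or_None).
    # For a 2048-byte sparse member whose only data block is (offset=0,
    # size=512) stored at archive position P, the map would be
    # [(True, 0, 512, P), (False, 512, 2048, None)]; the hole is served
    # as NUL bytes by read() below.
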
    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                self.fileobj.seek(offset + (self.position - start))
                b = self.fileobj.read(length)
                if len(b) != length:
                    raise ReadError("unexpected end of data")
                buf += b
            else:
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        self.closed = True
#class _FileInFile

class ExFileObject(io.BufferedReader):

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject

#------------------
# Exported Classes
#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

763    # "path" and "linkpath".
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    linkpath = property(_getlinkpath, _setlinkpath)

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.name, id(self))

    def get_info(self):
        """Return the TarInfo's attributes as a dictionary.
        """
        info = {
            "name":     self.name,
            "mode":     self.mode & 0o7777,
            "uid":      self.uid,
            "gid":      self.gid,
            "size":     self.size,
            "mtime":    self.mtime,
            "chksum":   self.chksum,
            "type":     self.type,
            "linkname": self.linkname,
            "uname":    self.uname,
            "gname":    self.gname,
            "devmajor": self.devmajor,
            "devminor": self.devminor
        }

        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.
        """
        info = self.get_info()

        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        elif format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        elif format == PAX_FORMAT:
            return self.create_pax_header(info, encoding)
        else:
            raise ValueError("invalid format")

    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
            raise ValueError("linkname is too long")

        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        buf = b""
        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
            buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)

        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
            buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that have to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")

    def _posix_split_name(self, name, encoding, errors):
        """Split a name longer than 100 chars into a prefix
           and a name part.
        """
        components = name.split("/")
        for i in range(1, len(components)):
            prefix = "/".join(components[:i])
            name = "/".join(components[i:])
            if len(prefix.encode(encoding, errors)) <= LENGTH_PREFIX and \
                    len(name.encode(encoding, errors)) <= LENGTH_NAME:
                break
        else:
            raise ValueError("name is too long")

        return prefix, name

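    # For illustration: _posix_split_name() above walks the "/" boundaries of
    # a long path until the prefix part fits in LENGTH_PREFIX (155) bytes and
    # the remaining name part fits in LENGTH_NAME (100) bytes; frombuf() later
    # rejoins the two with a "/" when the header is read back.
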
    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ", # checksum field
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf

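    # Note on the checksum patch in _create_header() above: the chksum field
    # occupies bytes 148-155 of the 512-byte block, so buf[:-364] ends at
    # offset 148 and buf[-357:] resumes at offset 155. The seven bytes written
    # in between ("%06o\0") leave byte 155 as the space that was packed into
    # the field initially, giving the conventional "six octal digits, NUL,
    # space" layout.
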
    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        blocks, remainder = divmod(len(payload), BLOCKSIZE)
        if remainder > 0:
            payload += (BLOCKSIZE - remainder) * NUL
        return payload

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name = name.encode(encoding, errors) + NUL

        info = {}
        info["name"] = "././@LongLink"
        info["type"] = type
        info["size"] = len(name)
        info["magic"] = GNU_MAGIC

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)

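    # Illustration of the pax records produced above: each record has the form
    # "<length> <keyword>=<value>\n", where <length> counts the whole record
    # including its own digits. For example b"30 mtime=1350244789.018581469\n"
    # is exactly 30 bytes long, which is why the loop above iterates until the
    # computed length stabilizes.
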
    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
        """
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

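    # Field offsets of the 512-byte ustar header block, as sliced by frombuf()
    # above: name 0-100, mode 100-108, uid 108-116, gid 116-124, size 124-136,
    # mtime 136-148, chksum 148-156, typeflag 156, linkname 157-257,
    # magic/version 257-265, uname 265-297, gname 297-329, devmajor 329-337,
    # devminor 337-345, prefix 345-500.
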
    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        elif self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        elif self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        else:
            return self._proc_builtin(tarfile)

    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            for i in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded, but POSIX.1-2008 allows tar
        # implementations to store them as raw binary strings if the
        # translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like this:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.
        """
        offsets = []
        for match in re.finditer(br"\d+ GNU.sparse.offset=(\d+)\n", buf):
            offsets.append(int(match.group(1)))
        numbytes = []
        for match in re.finditer(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf):
            numbytes.append(int(match.group(1)))
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.
        """
        sparse = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.
        """
        fields = None
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

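    # Illustration of the version 1.0 sparse map parsed above: the member's
    # data area starts with newline-separated decimal numbers, first the
    # number of map entries, then offset/size pairs. For example, a block
    # beginning with b"2\n0\n512\n4096\n512\n" yields
    # sparse = [(0, 512), (4096, 512)].
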
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                setattr(self, "path", value)
            elif keyword == "GNU.sparse.size":
                setattr(self, "size", int(value))
            elif keyword == "GNU.sparse.realsize":
                setattr(self, "size", int(value))
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        self.pax_headers = pax_headers.copy()

    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.
        """
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        blocks, remainder = divmod(count, BLOCKSIZE)
        if remainder:
            blocks += 1
        return blocks * BLOCKSIZE

    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
        return self.sparse is not None
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo

class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None,
            errorlevel=None, copybufsize=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file, 'w' to create a new file overwriting an existing one, or 'x'
           to create a new file only if it does not exist yet. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed when TarFile is closed.
1419        """
1420        modes = {"r": "rb", "a": "r+b", "w": "wb", "x": "xb"}
1421        if mode not in modes:
1422            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1423        self.mode = mode
1424        self._mode = modes[mode]
1425
1426        if not fileobj:
1427            if self.mode == "a" and not os.path.exists(name):
1428                # Create nonexistent files in append mode.
1429                self.mode = "w"
1430                self._mode = "wb"
1431            fileobj = bltn_open(name, self._mode)
1432            self._extfileobj = False
1433        else:
1434            if (name is None and hasattr(fileobj, "name") and
1435                isinstance(fileobj.name, (str, bytes))):
1436                name = fileobj.name
1437            if hasattr(fileobj, "mode"):
1438                self._mode = fileobj.mode
1439            self._extfileobj = True
1440        self.name = os.path.abspath(name) if name else None
1441        self.fileobj = fileobj
1442
1443        # Init attributes.
1444        if format is not None:
1445            self.format = format
1446        if tarinfo is not None:
1447            self.tarinfo = tarinfo
1448        if dereference is not None:
1449            self.dereference = dereference
1450        if ignore_zeros is not None:
1451            self.ignore_zeros = ignore_zeros
1452        if encoding is not None:
1453            self.encoding = encoding
1454        self.errors = errors
1455
1456        if pax_headers is not None and self.format == PAX_FORMAT:
1457            self.pax_headers = pax_headers
1458        else:
1459            self.pax_headers = {}
1460
1461        if debug is not None:
1462            self.debug = debug
1463        if errorlevel is not None:
1464            self.errorlevel = errorlevel
1465
1466        # Init datastructures.
1467        self.copybufsize = copybufsize
1468        self.closed = False
1469        self.members = []       # list of members as TarInfo objects
1470        self._loaded = False    # flag if all members have been read
1471        self.offset = self.fileobj.tell()
1472                                # current position in the archive file
1473        self.inodes = {}        # dictionary caching the inodes of
1474                                # archive members already added
1475
1476        try:
1477            if self.mode == "r":
1478                self.firstmember = None
1479                self.firstmember = self.next()
1480
1481            if self.mode == "a":
1482                # Move to the end of the archive,
1483                # before the first empty block.
1484                while True:
1485                    self.fileobj.seek(self.offset)
1486                    try:
1487                        tarinfo = self.tarinfo.fromtarfile(self)
1488                        self.members.append(tarinfo)
1489                    except EOFHeaderError:
1490                        self.fileobj.seek(self.offset)
1491                        break
1492                    except HeaderError as e:
1493                        raise ReadError(str(e))
1494
1495            if self.mode in ("a", "w", "x"):
1496                self._loaded = True
1497
1498                if self.pax_headers:
1499                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
1500                    self.fileobj.write(buf)
1501                    self.offset += len(buf)
1502        except:
1503            if not self._extfileobj:
1504                self.fileobj.close()
1505            self.closed = True
1506            raise
1507
1508    #--------------------------------------------------------------------------
1509    # Below are the classmethods which act as alternate constructors to the
1510    # TarFile class. The open() method is the only one that is needed for
1511    # public use; it is the "super"-constructor and is able to select an
1512    # adequate "sub"-constructor for a particular compression using the mapping
1513    # from OPEN_METH.
1514    #
1515    # This concept allows one to subclass TarFile without losing the comfort of
1516    # the super-constructor. A sub-constructor is registered and made available
1517    # by adding it to the mapping in OPEN_METH.
1518
1519    @classmethod
1520    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
1521        """Open a tar archive for reading, writing or appending. Return
1522           an appropriate TarFile instance.
1523
1524           mode:
1525           'r' or 'r:*' open for reading with transparent compression
1526           'r:'         open for reading exclusively uncompressed
1527           'r:gz'       open for reading with gzip compression
1528           'r:bz2'      open for reading with bzip2 compression
1529           'r:xz'       open for reading with lzma compression
1530           'a' or 'a:'  open for appending, creating the file if necessary
1531           'w' or 'w:'  open for writing without compression
1532           'w:gz'       open for writing with gzip compression
1533           'w:bz2'      open for writing with bzip2 compression
1534           'w:xz'       open for writing with lzma compression
1535
1536           'x' or 'x:'  create a tarfile exclusively without compression, raise
1537                        an exception if the file already exists
1538           'x:gz'       create a gzip compressed tarfile, raise an exception
1539                        if the file already exists
1540           'x:bz2'      create a bzip2 compressed tarfile, raise an exception
1541                        if the file already exists
1542           'x:xz'       create an lzma compressed tarfile, raise an exception
1543                        if the file already exists
1544
1545           'r|*'        open a stream of tar blocks with transparent compression
1546           'r|'         open an uncompressed stream of tar blocks for reading
1547           'r|gz'       open a gzip compressed stream of tar blocks
1548           'r|bz2'      open a bzip2 compressed stream of tar blocks
1549           'r|xz'       open an lzma compressed stream of tar blocks
1550           'w|'         open an uncompressed stream for writing
1551           'w|gz'       open a gzip compressed stream for writing
1552           'w|bz2'      open a bzip2 compressed stream for writing
1553           'w|xz'       open an lzma compressed stream for writing
1554        """
1555
1556        if not name and not fileobj:
1557            raise ValueError("nothing to open")
1558
1559        if mode in ("r", "r:*"):
1560            # Find out which *open() is appropriate for opening the file.
1561            def not_compressed(comptype):
1562                return cls.OPEN_METH[comptype] == 'taropen'
1563            for comptype in sorted(cls.OPEN_METH, key=not_compressed):
1564                func = getattr(cls, cls.OPEN_METH[comptype])
1565                if fileobj is not None:
1566                    saved_pos = fileobj.tell()
1567                try:
1568                    return func(name, "r", fileobj, **kwargs)
1569                except (ReadError, CompressionError):
1570                    if fileobj is not None:
1571                        fileobj.seek(saved_pos)
1572                    continue
1573            raise ReadError("file could not be opened successfully")
1574
1575        elif ":" in mode:
1576            filemode, comptype = mode.split(":", 1)
1577            filemode = filemode or "r"
1578            comptype = comptype or "tar"
1579
1580            # Select the *open() function according to
1581            # given compression.
1582            if comptype in cls.OPEN_METH:
1583                func = getattr(cls, cls.OPEN_METH[comptype])
1584            else:
1585                raise CompressionError("unknown compression type %r" % comptype)
1586            return func(name, filemode, fileobj, **kwargs)
1587
1588        elif "|" in mode:
1589            filemode, comptype = mode.split("|", 1)
1590            filemode = filemode or "r"
1591            comptype = comptype or "tar"
1592
1593            if filemode not in ("r", "w"):
1594                raise ValueError("mode must be 'r' or 'w'")
1595
1596            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
1597            try:
1598                t = cls(name, filemode, stream, **kwargs)
1599            except:
1600                stream.close()
1601                raise
1602            t._extfileobj = False
1603            return t
1604
1605        elif mode in ("a", "w", "x"):
1606            return cls.taropen(name, mode, fileobj, **kwargs)
1607
1608        raise ValueError("undiscernible mode")
1609
1610    @classmethod
1611    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1612        """Open uncompressed tar archive name for reading or writing.
1613        """
1614        if mode not in ("r", "a", "w", "x"):
1615            raise ValueError("mode must be 'r', 'a', 'w' or 'x'")
1616        return cls(name, mode, fileobj, **kwargs)
1617
1618    @classmethod
1619    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1620        """Open gzip compressed tar archive name for reading or writing.
1621           Appending is not allowed.
1622        """
1623        if mode not in ("r", "w", "x"):
1624            raise ValueError("mode must be 'r', 'w' or 'x'")
1625
1626        try:
1627            import gzip
1628            gzip.GzipFile
1629        except (ImportError, AttributeError):
1630            raise CompressionError("gzip module is not available")
1631
1632        try:
1633            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
1634        except OSError:
1635            if fileobj is not None and mode == 'r':
1636                raise ReadError("not a gzip file")
1637            raise
1638
1639        try:
1640            t = cls.taropen(name, mode, fileobj, **kwargs)
1641        except OSError:
1642            fileobj.close()
1643            if mode == 'r':
1644                raise ReadError("not a gzip file")
1645            raise
1646        except:
1647            fileobj.close()
1648            raise
1649        t._extfileobj = False
1650        return t
1651
1652    @classmethod
1653    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1654        """Open bzip2 compressed tar archive name for reading or writing.
1655           Appending is not allowed.
1656        """
1657        if mode not in ("r", "w", "x"):
1658            raise ValueError("mode must be 'r', 'w' or 'x'")
1659
1660        try:
1661            import bz2
1662        except ImportError:
1663            raise CompressionError("bz2 module is not available")
1664
1665        fileobj = bz2.BZ2File(fileobj or name, mode,
1666                              compresslevel=compresslevel)
1667
1668        try:
1669            t = cls.taropen(name, mode, fileobj, **kwargs)
1670        except (OSError, EOFError):
1671            fileobj.close()
1672            if mode == 'r':
1673                raise ReadError("not a bzip2 file")
1674            raise
1675        except:
1676            fileobj.close()
1677            raise
1678        t._extfileobj = False
1679        return t
1680
1681    @classmethod
1682    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
1683        """Open lzma compressed tar archive name for reading or writing.
1684           Appending is not allowed.
1685        """
1686        if mode not in ("r", "w", "x"):
1687            raise ValueError("mode must be 'r', 'w' or 'x'")
1688
1689        try:
1690            import lzma
1691        except ImportError:
1692            raise CompressionError("lzma module is not available")
1693
1694        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
1695
1696        try:
1697            t = cls.taropen(name, mode, fileobj, **kwargs)
1698        except (lzma.LZMAError, EOFError):
1699            fileobj.close()
1700            if mode == 'r':
1701                raise ReadError("not an lzma file")
1702            raise
1703        except:
1704            fileobj.close()
1705            raise
1706        t._extfileobj = False
1707        return t
1708
1709    # All *open() methods are registered here.
1710    OPEN_METH = {
1711        "tar": "taropen",   # uncompressed tar
1712        "gz":  "gzopen",    # gzip compressed tar
1713        "bz2": "bz2open",   # bzip2 compressed tar
1714        "xz":  "xzopen"     # lzma compressed tar
1715    }
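
    # Subclassing sketch (illustrative only, not executed by this module): a
    # sub-constructor becomes available to open() by adding its method name to
    # OPEN_METH on a subclass.  The alias below merely reuses xzopen(); a real
    # new compression would pair a new key with its own *open() classmethod.
    #
    #     class AliasTarFile(TarFile):
    #         # Accept "r:lzma"/"w:lzma" in addition to "r:xz"/"w:xz".
    #         OPEN_METH = {**TarFile.OPEN_METH, "lzma": "xzopen"}
    #
    #     tar = AliasTarFile.open("backup.tar.lzma", "r:lzma")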
1716
1717    #--------------------------------------------------------------------------
1718    # The public methods which TarFile provides:
1719
1720    def close(self):
1721        """Close the TarFile. In write-mode, two finishing zero blocks are
1722           appended to the archive.
1723        """
1724        if self.closed:
1725            return
1726
1727        self.closed = True
1728        try:
1729            if self.mode in ("a", "w", "x"):
1730                self.fileobj.write(NUL * (BLOCKSIZE * 2))
1731                self.offset += (BLOCKSIZE * 2)
1732                # fill up the end with zero-blocks
1733                # (like option -b20 for tar does)
1734                blocks, remainder = divmod(self.offset, RECORDSIZE)
1735                if remainder > 0:
1736                    self.fileobj.write(NUL * (RECORDSIZE - remainder))
1737        finally:
1738            if not self._extfileobj:
1739                self.fileobj.close()
1740
1741    def getmember(self, name):
1742        """Return a TarInfo object for member `name'. If `name' can not be
1743           found in the archive, KeyError is raised. If a member occurs more
1744           than once in the archive, its last occurrence is assumed to be the
1745           most up-to-date version.
1746        """
1747        tarinfo = self._getmember(name)
1748        if tarinfo is None:
1749            raise KeyError("filename %r not found" % name)
1750        return tarinfo
1751
1752    def getmembers(self):
1753        """Return the members of the archive as a list of TarInfo objects. The
1754           list has the same order as the members in the archive.
1755        """
1756        self._check()
1757        if not self._loaded:    # if we want to obtain a list of
1758            self._load()        # all members, we first have to
1759                                # scan the whole archive.
1760        return self.members
1761
1762    def getnames(self):
1763        """Return the members of the archive as a list of their names. It has
1764           the same order as the list returned by getmembers().
1765        """
1766        return [tarinfo.name for tarinfo in self.getmembers()]
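
    # Usage sketch (illustrative only, not executed by this module; the archive
    # and member names are assumptions):
    #
    #     import tarfile
    #     with tarfile.open("backup.tar", "r") as tar:
    #         print(tar.getnames())                # all member names, in order
    #         info = tar.getmember("data.txt")     # KeyError if not present
    #         print(info.size, info.mtime)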
1767
1768    def gettarinfo(self, name=None, arcname=None, fileobj=None):
1769        """Create a TarInfo object from the result of os.stat or equivalent
1770           on an existing file. The file is either named by `name', or
1771           specified as a file object `fileobj' with a file descriptor. If
1772           given, `arcname' specifies an alternative name for the file in the
1773           archive; otherwise, the name is taken from the 'name' attribute of
1774           'fileobj', or the 'name' argument. The name should be a text
1775           string.
1776        """
1777        self._check("awx")
1778
1779        # When fileobj is given, replace name by
1780        # fileobj's real name.
1781        if fileobj is not None:
1782            name = fileobj.name
1783
1784        # Build the name of the member in the archive.
1785        # Backslashes are converted to forward slashes,
1786        # absolute paths are turned into relative paths.
1787        if arcname is None:
1788            arcname = name
1789        drv, arcname = os.path.splitdrive(arcname)
1790        arcname = arcname.replace(os.sep, "/")
1791        arcname = arcname.lstrip("/")
1792
1793        # Now, fill the TarInfo object with
1794        # information specific for the file.
1795        tarinfo = self.tarinfo()
1796        tarinfo.tarfile = self  # Not needed
1797
1798        # Use os.stat or os.lstat, depending on platform
1799        # and if symlinks shall be resolved.
1800        if fileobj is None:
1801            if hasattr(os, "lstat") and not self.dereference:
1802                statres = os.lstat(name)
1803            else:
1804                statres = os.stat(name)
1805        else:
1806            statres = os.fstat(fileobj.fileno())
1807        linkname = ""
1808
1809        stmd = statres.st_mode
1810        if stat.S_ISREG(stmd):
1811            inode = (statres.st_ino, statres.st_dev)
1812            if not self.dereference and statres.st_nlink > 1 and \
1813                    inode in self.inodes and arcname != self.inodes[inode]:
1814                # Is it a hardlink to an already
1815                # archived file?
1816                type = LNKTYPE
1817                linkname = self.inodes[inode]
1818            else:
1819                # The inode is added only if it's valid.
1820                # For win32 it is always 0.
1821                type = REGTYPE
1822                if inode[0]:
1823                    self.inodes[inode] = arcname
1824        elif stat.S_ISDIR(stmd):
1825            type = DIRTYPE
1826        elif stat.S_ISFIFO(stmd):
1827            type = FIFOTYPE
1828        elif stat.S_ISLNK(stmd):
1829            type = SYMTYPE
1830            linkname = os.readlink(name)
1831        elif stat.S_ISCHR(stmd):
1832            type = CHRTYPE
1833        elif stat.S_ISBLK(stmd):
1834            type = BLKTYPE
1835        else:
1836            return None
1837
1838        # Fill the TarInfo object with all
1839        # information we can get.
1840        tarinfo.name = arcname
1841        tarinfo.mode = stmd
1842        tarinfo.uid = statres.st_uid
1843        tarinfo.gid = statres.st_gid
1844        if type == REGTYPE:
1845            tarinfo.size = statres.st_size
1846        else:
1847            tarinfo.size = 0
1848        tarinfo.mtime = statres.st_mtime
1849        tarinfo.type = type
1850        tarinfo.linkname = linkname
1851        if pwd:
1852            try:
1853                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
1854            except KeyError:
1855                pass
1856        if grp:
1857            try:
1858                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
1859            except KeyError:
1860                pass
1861
1862        if type in (CHRTYPE, BLKTYPE):
1863            if hasattr(os, "major") and hasattr(os, "minor"):
1864                tarinfo.devmajor = os.major(statres.st_rdev)
1865                tarinfo.devminor = os.minor(statres.st_rdev)
1866        return tarinfo
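
    # Usage sketch (illustrative only, not executed by this module): gettarinfo()
    # builds a TarInfo from the file's stat() result, and the header can be
    # adjusted before it is written with addfile().  File names are assumptions.
    #
    #     import tarfile
    #     with tarfile.open("backup.tar", "w") as tar:
    #         info = tar.gettarinfo("data.txt", arcname="docs/data.txt")
    #         info.uname = info.gname = "root"     # override ownership metadata
    #         with open("data.txt", "rb") as f:
    #             tar.addfile(info, f)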
1867
1868    def list(self, verbose=True, *, members=None):
1869        """Print a table of contents to sys.stdout. If `verbose' is False, only
1870           the names of the members are printed. If it is True, an `ls -l'-like
1871           output is produced. `members' is optional and must be a subset of the
1872           list returned by getmembers().
1873        """
1874        self._check()
1875
1876        if members is None:
1877            members = self
1878        for tarinfo in members:
1879            if verbose:
1880                _safe_print(stat.filemode(tarinfo.mode))
1881                _safe_print("%s/%s" % (tarinfo.uname or tarinfo.uid,
1882                                       tarinfo.gname or tarinfo.gid))
1883                if tarinfo.ischr() or tarinfo.isblk():
1884                    _safe_print("%10s" %
1885                            ("%d,%d" % (tarinfo.devmajor, tarinfo.devminor)))
1886                else:
1887                    _safe_print("%10d" % tarinfo.size)
1888                _safe_print("%d-%02d-%02d %02d:%02d:%02d" \
1889                            % time.localtime(tarinfo.mtime)[:6])
1890
1891            _safe_print(tarinfo.name + ("/" if tarinfo.isdir() else ""))
1892
1893            if verbose:
1894                if tarinfo.issym():
1895                    _safe_print("-> " + tarinfo.linkname)
1896                if tarinfo.islnk():
1897                    _safe_print("link to " + tarinfo.linkname)
1898            print()
1899
1900    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
1901        """Add the file `name' to the archive. `name' may be any type of file
1902           (directory, fifo, symbolic link, etc.). If given, `arcname'
1903           specifies an alternative name for the file in the archive.
1904           Directories are added recursively by default. This can be avoided by
1905           setting `recursive' to False. `exclude' is a function that should
1906           return True for each filename to be excluded. `filter' is a function
1907           that expects a TarInfo object argument and returns the changed
1908           TarInfo object; if it returns None, the TarInfo object will be
1909           excluded from the archive.
1910        """
1911        self._check("awx")
1912
1913        if arcname is None:
1914            arcname = name
1915
1916        # Exclude pathnames.
1917        if exclude is not None:
1918            import warnings
1919            warnings.warn("use the filter argument instead",
1920                    DeprecationWarning, 2)
1921            if exclude(name):
1922                self._dbg(2, "tarfile: Excluded %r" % name)
1923                return
1924
1925        # Skip if somebody tries to archive the archive...
1926        if self.name is not None and os.path.abspath(name) == self.name:
1927            self._dbg(2, "tarfile: Skipped %r" % name)
1928            return
1929
1930        self._dbg(1, name)
1931
1932        # Create a TarInfo object from the file.
1933        tarinfo = self.gettarinfo(name, arcname)
1934
1935        if tarinfo is None:
1936            self._dbg(1, "tarfile: Unsupported type %r" % name)
1937            return
1938
1939        # Change or exclude the TarInfo object.
1940        if filter is not None:
1941            tarinfo = filter(tarinfo)
1942            if tarinfo is None:
1943                self._dbg(2, "tarfile: Excluded %r" % name)
1944                return
1945
1946        # Append the tar header and data to the archive.
1947        if tarinfo.isreg():
1948            with bltn_open(name, "rb") as f:
1949                self.addfile(tarinfo, f)
1950
1951        elif tarinfo.isdir():
1952            self.addfile(tarinfo)
1953            if recursive:
1954                for f in os.listdir(name):
1955                    self.add(os.path.join(name, f), os.path.join(arcname, f),
1956                            recursive, exclude, filter=filter)
1957
1958        else:
1959            self.addfile(tarinfo)
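
    # Usage sketch (illustrative only, not executed by this module): a filter
    # callable can rewrite or drop members while a tree is added recursively.
    # The paths and the exclusion rule are assumptions for the example.
    #
    #     import tarfile
    #
    #     def normalize(tarinfo):
    #         if tarinfo.name.endswith(".pyc"):
    #             return None                      # exclude compiled files
    #         tarinfo.uid = tarinfo.gid = 0
    #         tarinfo.uname = tarinfo.gname = "root"
    #         return tarinfo
    #
    #     with tarfile.open("project.tar.gz", "w:gz") as tar:
    #         tar.add("project", filter=normalize)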
1960
1961    def addfile(self, tarinfo, fileobj=None):
1962        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
1963           given, it should be a binary file, and tarinfo.size bytes are read
1964           from it and added to the archive. You can create TarInfo objects
1965           directly, or by using gettarinfo().
1966        """
1967        self._check("awx")
1968
1969        tarinfo = copy.copy(tarinfo)
1970
1971        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
1972        self.fileobj.write(buf)
1973        self.offset += len(buf)
1974        bufsize = self.copybufsize
1975        # If there's data to follow, append it.
1976        if fileobj is not None:
1977            copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
1978            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
1979            if remainder > 0:
1980                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
1981                blocks += 1
1982            self.offset += blocks * BLOCKSIZE
1983
1984        self.members.append(tarinfo)
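
    # Usage sketch (illustrative only, not executed by this module): a member
    # can be synthesized entirely in memory by pairing a hand-built TarInfo
    # with a file object whose length matches tarinfo.size.
    #
    #     import io, tarfile, time
    #     data = b"hello world\n"
    #     info = tarfile.TarInfo(name="greeting.txt")
    #     info.size = len(data)
    #     info.mtime = time.time()
    #     with tarfile.open("greeting.tar", "w") as tar:
    #         tar.addfile(info, io.BytesIO(data))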
1985
1986    def extractall(self, path=".", members=None, *, numeric_owner=False):
1987        """Extract all members from the archive to the current working
1988           directory and set owner, modification time and permissions on
1989           directories afterwards. `path' specifies a different directory
1990           to extract to. `members' is optional and must be a subset of the
1991           list returned by getmembers(). If `numeric_owner` is True, only
1992           the numbers for user/group names are used and not the names.
1993        """
1994        directories = []
1995
1996        if members is None:
1997            members = self
1998
1999        for tarinfo in members:
2000            if tarinfo.isdir():
2001                # Extract directories with a safe mode.
2002                directories.append(tarinfo)
2003                tarinfo = copy.copy(tarinfo)
2004                tarinfo.mode = 0o700
2005            # Do not set_attrs directories, as we will do that further down
2006            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir(),
2007                         numeric_owner=numeric_owner)
2008
2009        # Reverse sort directories.
2010        directories.sort(key=lambda a: a.name)
2011        directories.reverse()
2012
2013        # Set correct owner, mtime and filemode on directories.
2014        for tarinfo in directories:
2015            dirpath = os.path.join(path, tarinfo.name)
2016            try:
2017                self.chown(tarinfo, dirpath, numeric_owner=numeric_owner)
2018                self.utime(tarinfo, dirpath)
2019                self.chmod(tarinfo, dirpath)
2020            except ExtractError as e:
2021                if self.errorlevel > 1:
2022                    raise
2023                else:
2024                    self._dbg(1, "tarfile: %s" % e)
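
    # Usage sketch (illustrative only, not executed by this module): extract
    # everything below a chosen directory, or only a filtered subset of the
    # members.  Archives from untrusted sources can contain absolute or ".."
    # member names that escape `path', so inspect the member list first.
    # The paths are assumptions for the example.
    #
    #     import tarfile
    #     with tarfile.open("backup.tar.gz", "r:gz") as tar:
    #         docs = [m for m in tar.getmembers() if m.name.startswith("docs/")]
    #         tar.extractall(path="restore", members=docs)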
2025
2026    def extract(self, member, path="", set_attrs=True, *, numeric_owner=False):
2027        """Extract a member from the archive to the current working directory,
2028           using its full name. Its file information is extracted as accurately
2029           as possible. `member' may be a filename or a TarInfo object. You can
2030           specify a different directory using `path'. File attributes (owner,
2031           mtime, mode) are set unless `set_attrs' is False. If `numeric_owner`
2032           is True, only the numbers for user/group names are used and not
2033           the names.
2034        """
2035        self._check("r")
2036
2037        if isinstance(member, str):
2038            tarinfo = self.getmember(member)
2039        else:
2040            tarinfo = member
2041
2042        # Prepare the link target for makelink().
2043        if tarinfo.islnk():
2044            tarinfo._link_target = os.path.join(path, tarinfo.linkname)
2045
2046        try:
2047            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
2048                                 set_attrs=set_attrs,
2049                                 numeric_owner=numeric_owner)
2050        except OSError as e:
2051            if self.errorlevel > 0:
2052                raise
2053            else:
2054                if e.filename is None:
2055                    self._dbg(1, "tarfile: %s" % e.strerror)
2056                else:
2057                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2058        except ExtractError as e:
2059            if self.errorlevel > 1:
2060                raise
2061            else:
2062                self._dbg(1, "tarfile: %s" % e)
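
    # Usage sketch (illustrative only, not executed by this module; the names
    # are assumptions): extract a single member, keeping its stored path
    # relative to `path'.
    #
    #     import tarfile
    #     with tarfile.open("backup.tar", "r") as tar:
    #         tar.extract("docs/readme.txt", path="restore")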
2063
2064    def extractfile(self, member):
2065        """Extract a member from the archive as a file object. `member' may be
2066           a filename or a TarInfo object. If `member' is a regular file or a
2067           link, an io.BufferedReader object is returned. Otherwise, None is
2068           returned.
2069        """
2070        self._check("r")
2071
2072        if isinstance(member, str):
2073            tarinfo = self.getmember(member)
2074        else:
2075            tarinfo = member
2076
2077        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
2078            # Members with unknown types are treated as regular files.
2079            return self.fileobject(self, tarinfo)
2080
2081        elif tarinfo.islnk() or tarinfo.issym():
2082            if isinstance(self.fileobj, _Stream):
2083                # A small but ugly workaround for the case that someone tries
2084                # to extract a (sym)link as a file-object from a non-seekable
2085                # stream of tar blocks.
2086                raise StreamError("cannot extract (sym)link as file object")
2087            else:
2088                # A (sym)link's file object is its target's file object.
2089                return self.extractfile(self._find_link_target(tarinfo))
2090        else:
2091            # If there's no data associated with the member (directory, chrdev,
2092            # blkdev, etc.), return None instead of a file object.
2093            return None
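
    # Usage sketch (illustrative only, not executed by this module; the archive
    # and member names are assumptions):
    #
    #     import tarfile
    #     with tarfile.open("backup.tar", "r") as tar:
    #         f = tar.extractfile("docs/readme.txt")
    #         if f is not None:                    # None for directories etc.
    #             text = f.read().decode("utf-8")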
2094
2095    def _extract_member(self, tarinfo, targetpath, set_attrs=True,
2096                        numeric_owner=False):
2097        """Extract the TarInfo object tarinfo to a physical
2098           file called targetpath.
2099        """
2100        # Fetch the TarInfo object for the given name
2101        # and build the destination pathname, replacing
2102        # forward slashes with platform-specific separators.
2103        targetpath = targetpath.rstrip("/")
2104        targetpath = targetpath.replace("/", os.sep)
2105
2106        # Create all upper directories.
2107        upperdirs = os.path.dirname(targetpath)
2108        if upperdirs and not os.path.exists(upperdirs):
2109            # Create directories that are not part of the archive with
2110            # default permissions.
2111            os.makedirs(upperdirs)
2112
2113        if tarinfo.islnk() or tarinfo.issym():
2114            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
2115        else:
2116            self._dbg(1, tarinfo.name)
2117
2118        if tarinfo.isreg():
2119            self.makefile(tarinfo, targetpath)
2120        elif tarinfo.isdir():
2121            self.makedir(tarinfo, targetpath)
2122        elif tarinfo.isfifo():
2123            self.makefifo(tarinfo, targetpath)
2124        elif tarinfo.ischr() or tarinfo.isblk():
2125            self.makedev(tarinfo, targetpath)
2126        elif tarinfo.islnk() or tarinfo.issym():
2127            self.makelink(tarinfo, targetpath)
2128        elif tarinfo.type not in SUPPORTED_TYPES:
2129            self.makeunknown(tarinfo, targetpath)
2130        else:
2131            self.makefile(tarinfo, targetpath)
2132
2133        if set_attrs:
2134            self.chown(tarinfo, targetpath, numeric_owner)
2135            if not tarinfo.issym():
2136                self.chmod(tarinfo, targetpath)
2137                self.utime(tarinfo, targetpath)
2138
2139    #--------------------------------------------------------------------------
2140    # Below are the different file methods. They are called via
2141    # _extract_member() when extract() is called. They can be replaced in a
2142    # subclass to implement other functionality.
2143
2144    def makedir(self, tarinfo, targetpath):
2145        """Make a directory called targetpath.
2146        """
2147        try:
2148            # Use a safe mode for the directory, the real mode is set
2149            # later in _extract_member().
2150            os.mkdir(targetpath, 0o700)
2151        except FileExistsError:
2152            pass
2153
2154    def makefile(self, tarinfo, targetpath):
2155        """Make a file called targetpath.
2156        """
2157        source = self.fileobj
2158        source.seek(tarinfo.offset_data)
2159        bufsize = self.copybufsize
2160        with bltn_open(targetpath, "wb") as target:
2161            if tarinfo.sparse is not None:
2162                for offset, size in tarinfo.sparse:
2163                    target.seek(offset)
2164                    copyfileobj(source, target, size, ReadError, bufsize)
2165                target.seek(tarinfo.size)
2166                target.truncate()
2167            else:
2168                copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
2169
2170    def makeunknown(self, tarinfo, targetpath):
2171        """Make a file from a TarInfo object with an unknown type
2172           at targetpath.
2173        """
2174        self.makefile(tarinfo, targetpath)
2175        self._dbg(1, "tarfile: Unknown file type %r, " \
2176                     "extracted as regular file." % tarinfo.type)
2177
2178    def makefifo(self, tarinfo, targetpath):
2179        """Make a fifo called targetpath.
2180        """
2181        if hasattr(os, "mkfifo"):
2182            os.mkfifo(targetpath)
2183        else:
2184            raise ExtractError("fifo not supported by system")
2185
2186    def makedev(self, tarinfo, targetpath):
2187        """Make a character or block device called targetpath.
2188        """
2189        if not hasattr(os, "mknod") or not hasattr(os, "makedev"):
2190            raise ExtractError("special devices not supported by system")
2191
2192        mode = tarinfo.mode
2193        if tarinfo.isblk():
2194            mode |= stat.S_IFBLK
2195        else:
2196            mode |= stat.S_IFCHR
2197
2198        os.mknod(targetpath, mode,
2199                 os.makedev(tarinfo.devmajor, tarinfo.devminor))
2200
2201    def makelink(self, tarinfo, targetpath):
2202        """Make a (symbolic) link called targetpath. If it cannot be created
2203           (platform limitation), we try to make a copy of the referenced file
2204           instead of a link.
2205        """
2206        try:
2207            # For systems that support symbolic and hard links.
2208            if tarinfo.issym():
2209                os.symlink(tarinfo.linkname, targetpath)
2210            else:
2211                # See extract().
2212                if os.path.exists(tarinfo._link_target):
2213                    os.link(tarinfo._link_target, targetpath)
2214                else:
2215                    self._extract_member(self._find_link_target(tarinfo),
2216                                         targetpath)
2217        except symlink_exception:
2218            try:
2219                self._extract_member(self._find_link_target(tarinfo),
2220                                     targetpath)
2221            except KeyError:
2222                raise ExtractError("unable to resolve link inside archive")
2223
2224    def chown(self, tarinfo, targetpath, numeric_owner):
2225        """Set owner of targetpath according to tarinfo. If numeric_owner
2226           is True, use .gid/.uid instead of .gname/.uname. If numeric_owner
2227           is False, fall back to .gid/.uid when the search based on name
2228           fails.
2229        """
2230        if hasattr(os, "geteuid") and os.geteuid() == 0:
2231            # We have to be root to do so.
2232            g = tarinfo.gid
2233            u = tarinfo.uid
2234            if not numeric_owner:
2235                try:
2236                    if grp:
2237                        g = grp.getgrnam(tarinfo.gname)[2]
2238                except KeyError:
2239                    pass
2240                try:
2241                    if pwd:
2242                        u = pwd.getpwnam(tarinfo.uname)[2]
2243                except KeyError:
2244                    pass
2245            try:
2246                if tarinfo.issym() and hasattr(os, "lchown"):
2247                    os.lchown(targetpath, u, g)
2248                else:
2249                    os.chown(targetpath, u, g)
2250            except OSError:
2251                raise ExtractError("could not change owner")
2252
2253    def chmod(self, tarinfo, targetpath):
2254        """Set file permissions of targetpath according to tarinfo.
2255        """
2256        if hasattr(os, 'chmod'):
2257            try:
2258                os.chmod(targetpath, tarinfo.mode)
2259            except OSError:
2260                raise ExtractError("could not change mode")
2261
2262    def utime(self, tarinfo, targetpath):
2263        """Set modification time of targetpath according to tarinfo.
2264        """
2265        if not hasattr(os, 'utime'):
2266            return
2267        try:
2268            os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2269        except OSError:
2270            raise ExtractError("could not change modification time")
2271
2272    #--------------------------------------------------------------------------
2273    def next(self):
2274        """Return the next member of the archive as a TarInfo object, when
2275           TarFile is opened for reading. Return None if there are no more
2276           members available.
2277        """
2278        self._check("ra")
2279        if self.firstmember is not None:
2280            m = self.firstmember
2281            self.firstmember = None
2282            return m
2283
2284        # Advance the file pointer.
2285        if self.offset != self.fileobj.tell():
2286            self.fileobj.seek(self.offset - 1)
2287            if not self.fileobj.read(1):
2288                raise ReadError("unexpected end of data")
2289
2290        # Read the next block.
2291        tarinfo = None
2292        while True:
2293            try:
2294                tarinfo = self.tarinfo.fromtarfile(self)
2295            except EOFHeaderError as e:
2296                if self.ignore_zeros:
2297                    self._dbg(2, "0x%X: %s" % (self.offset, e))
2298                    self.offset += BLOCKSIZE
2299                    continue
2300            except InvalidHeaderError as e:
2301                if self.ignore_zeros:
2302                    self._dbg(2, "0x%X: %s" % (self.offset, e))
2303                    self.offset += BLOCKSIZE
2304                    continue
2305                elif self.offset == 0:
2306                    raise ReadError(str(e))
2307            except EmptyHeaderError:
2308                if self.offset == 0:
2309                    raise ReadError("empty file")
2310            except TruncatedHeaderError as e:
2311                if self.offset == 0:
2312                    raise ReadError(str(e))
2313            except SubsequentHeaderError as e:
2314                raise ReadError(str(e))
2315            break
2316
2317        if tarinfo is not None:
2318            self.members.append(tarinfo)
2319        else:
2320            self._loaded = True
2321
2322        return tarinfo
2323
2324    #--------------------------------------------------------------------------
2325    # Little helper methods:
2326
2327    def _getmember(self, name, tarinfo=None, normalize=False):
2328        """Find an archive member by name from bottom to top.
2329           If tarinfo is given, it is used as the starting point.
2330        """
2331        # Ensure that all members have been loaded.
2332        members = self.getmembers()
2333
2334        # Limit the member search list up to tarinfo.
2335        if tarinfo is not None:
2336            members = members[:members.index(tarinfo)]
2337
2338        if normalize:
2339            name = os.path.normpath(name)
2340
2341        for member in reversed(members):
2342            if normalize:
2343                member_name = os.path.normpath(member.name)
2344            else:
2345                member_name = member.name
2346
2347            if name == member_name:
2348                return member
2349
2350    def _load(self):
2351        """Read through the entire archive file and look for readable
2352           members.
2353        """
2354        while True:
2355            tarinfo = self.next()
2356            if tarinfo is None:
2357                break
2358        self._loaded = True
2359
2360    def _check(self, mode=None):
2361        """Check if TarFile is still open, and if the operation's mode
2362           corresponds to TarFile's mode.
2363        """
2364        if self.closed:
2365            raise OSError("%s is closed" % self.__class__.__name__)
2366        if mode is not None and self.mode not in mode:
2367            raise OSError("bad operation for mode %r" % self.mode)
2368
2369    def _find_link_target(self, tarinfo):
2370        """Find the target member of a symlink or hardlink member in the
2371           archive.
2372        """
2373        if tarinfo.issym():
2374            # Always search the entire archive.
2375            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2376            limit = None
2377        else:
2378            # Search the archive before the link, because a hard link is
2379            # just a reference to an already archived file.
2380            linkname = tarinfo.linkname
2381            limit = tarinfo
2382
2383        member = self._getmember(linkname, tarinfo=limit, normalize=True)
2384        if member is None:
2385            raise KeyError("linkname %r not found" % linkname)
2386        return member
2387
2388    def __iter__(self):
2389        """Provide an iterator object.
2390        """
2391        if self._loaded:
2392            yield from self.members
2393            return
2394
2395        # Yield items using TarFile's next() method.
2396        # When all members have been read, set TarFile as _loaded.
2397        index = 0
2398        # Fix for SF #1100429: Under rare circumstances it can
2399        # happen that getmembers() is called during iteration,
2400        # which will have already exhausted the next() method.
2401        if self.firstmember is not None:
2402            tarinfo = self.next()
2403            index += 1
2404            yield tarinfo
2405
2406        while True:
2407            if index < len(self.members):
2408                tarinfo = self.members[index]
2409            elif not self._loaded:
2410                tarinfo = self.next()
2411                if not tarinfo:
2412                    self._loaded = True
2413                    return
2414            else:
2415                return
2416            index += 1
2417            yield tarinfo
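
    # Usage sketch (illustrative only, not executed by this module): iterating
    # over a TarFile reads one header at a time, which also works for
    # non-seekable "r|*" streams.  The command producing the stream is an
    # assumption for the example.
    #
    #     import subprocess, tarfile
    #     proc = subprocess.Popen(["cat", "backup.tar"], stdout=subprocess.PIPE)
    #     with tarfile.open(fileobj=proc.stdout, mode="r|") as tar:
    #         for member in tar:
    #             print(member.name)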
2418
2419    def _dbg(self, level, msg):
2420        """Write debugging output to sys.stderr.
2421        """
2422        if level <= self.debug:
2423            print(msg, file=sys.stderr)
2424
2425    def __enter__(self):
2426        self._check()
2427        return self
2428
2429    def __exit__(self, type, value, traceback):
2430        if type is None:
2431            self.close()
2432        else:
2433            # An exception occurred. We must not call close() because
2434            # it would try to write end-of-archive blocks and padding.
2435            if not self._extfileobj:
2436                self.fileobj.close()
2437            self.closed = True
2438
2439#--------------------
2440# exported functions
2441#--------------------
2442def is_tarfile(name):
2443    """Return True if name points to a tar archive that we
2444       are able to handle, else return False.
2445    """
2446    try:
2447        t = open(name)
2448        t.close()
2449        return True
2450    except TarError:
2451        return False
2452
2453open = TarFile.open
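
# Usage sketch (illustrative only; the path is an assumption): is_tarfile()
# probes a path, and the module-level open() is the same callable as
# TarFile.open().
#
#     import tarfile
#     if tarfile.is_tarfile("maybe.tar"):
#         with tarfile.open("maybe.tar") as tar:
#             tar.list(verbose=False)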
2454
2455
2456def main():
2457    import argparse
2458
2459    description = 'A simple command line interface for tarfile module.'
2460    parser = argparse.ArgumentParser(description=description)
2461    parser.add_argument('-v', '--verbose', action='store_true', default=False,
2462                        help='Verbose output')
2463    group = parser.add_mutually_exclusive_group()
2464    group.add_argument('-l', '--list', metavar='<tarfile>',
2465                       help='Show listing of a tarfile')
2466    group.add_argument('-e', '--extract', nargs='+',
2467                       metavar=('<tarfile>', '<output_dir>'),
2468                       help='Extract tarfile into target dir')
2469    group.add_argument('-c', '--create', nargs='+',
2470                       metavar=('<name>', '<file>'),
2471                       help='Create tarfile from sources')
2472    group.add_argument('-t', '--test', metavar='<tarfile>',
2473                       help='Test if a tarfile is valid')
2474    args = parser.parse_args()
2475
2476    if args.test:
2477        src = args.test
2478        if is_tarfile(src):
2479            with open(src, 'r') as tar:
2480                print(tar.getmembers(), file=sys.stderr)
2482            if args.verbose:
2483                print('{!r} is a tar archive.'.format(src))
2484        else:
2485            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
2486
2487    elif args.list:
2488        src = args.list
2489        if is_tarfile(src):
2490            with TarFile.open(src, 'r:*') as tf:
2491                tf.list(verbose=args.verbose)
2492        else:
2493            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
2494
2495    elif args.extract:
2496        if len(args.extract) == 1:
2497            src = args.extract[0]
2498            curdir = os.curdir
2499        elif len(args.extract) == 2:
2500            src, curdir = args.extract
2501        else:
2502            parser.exit(1, parser.format_help())
2503
2504        if is_tarfile(src):
2505            with TarFile.open(src, 'r:*') as tf:
2506                tf.extractall(path=curdir)
2507            if args.verbose:
2508                if curdir == '.':
2509                    msg = '{!r} file is extracted.'.format(src)
2510                else:
2511                    msg = ('{!r} file is extracted '
2512                           'into {!r} directory.').format(src, curdir)
2513                print(msg)
2514        else:
2515            parser.exit(1, '{!r} is not a tar archive.\n'.format(src))
2516
2517    elif args.create:
2518        tar_name = args.create.pop(0)
2519        _, ext = os.path.splitext(tar_name)
2520        compressions = {
2521            # gz
2522            '.gz': 'gz',
2523            '.tgz': 'gz',
2524            # xz
2525            '.xz': 'xz',
2526            '.txz': 'xz',
2527            # bz2
2528            '.bz2': 'bz2',
2529            '.tbz': 'bz2',
2530            '.tbz2': 'bz2',
2531            '.tb2': 'bz2',
2532        }
2533        tar_mode = 'w:' + compressions[ext] if ext in compressions else 'w'
2534        tar_files = args.create
2535
2536        with TarFile.open(tar_name, tar_mode) as tf:
2537            for file_name in tar_files:
2538                tf.add(file_name)
2539
2540        if args.verbose:
2541            print('{!r} file created.'.format(tar_name))
2542
2543    else:
2544        parser.exit(1, parser.format_help())
2545
2546if __name__ == '__main__':
2547    main()
2548