• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Interface to the libbzip2 compression library.
2
3This module provides a file interface, classes for incremental
4(de)compression, and functions for one-shot (de)compression.
5"""
6
# Public names exported by a star-import of this module.
__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

# Module author, kept as package metadata.
__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"
11
12from builtins import open as _builtin_open
13import io
14import os
15import _compression
16
17from _bz2 import BZ2Compressor, BZ2Decompressor
18
19
# State codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """Binary file object that transparently (de)compresses bzip2 data.

    An instance either wraps a file object supplied by the caller or
    opens a file on disk itself when given a path.

    The interface is strictly *binary*: reads return bytes, and writes
    expect bytes (or another object supporting the buffer protocol).
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        filename is either the name of the file to open (a str, bytes
        or PathLike object) or an existing file object through which
        the compressed data is read or written.

        mode is 'r' (reading, the default), 'w' ((over)writing), 'x'
        (exclusive creation) or 'a' (appending); the spellings 'rb',
        'wb', 'xb' and 'ab' are accepted as equivalents.

        compresslevel must be a number between 1 and 9. It is used by
        the write modes: 1 gives the least compression and 9 (the
        default) the most.

        In read mode the input may consist of several compressed
        streams concatenated together.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read or write attribute.
        """
        # Begin in the closed state so that close() is a no-op if any
        # of the validation or opening steps below raises.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        if mode in ("", "r", "rb"):
            mode, mode_code = "rb", _MODE_READ
        else:
            # Every remaining valid mode writes; normalise its spelling
            # to the binary form handed to the builtin open().
            if mode in ("w", "wb"):
                mode = "wb"
            elif mode in ("x", "xb"):
                mode = "xb"
            elif mode in ("a", "ab"):
                mode = "ab"
            else:
                raise ValueError("Invalid mode: %r" % (mode,))
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)

        if isinstance(filename, (str, bytes, os.PathLike)):
            self._fp = _builtin_open(filename, mode)
            # We opened the file ourselves, so close() must close it.
            self._closefp = True
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")
        self._mode = mode_code

        if mode_code == _MODE_READ:
            reader = _compression.DecompressReader(
                self._fp, BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(reader)
        else:
            # Count of uncompressed bytes written so far; see tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        Calling this more than once is harmless. After the file has
        been closed, any other operation on it raises ValueError.
        """
        state = self._mode
        if state == _MODE_CLOSED:
            return
        try:
            if state == _MODE_READ:
                self._buffer.close()
            elif state == _MODE_WRITE:
                # Emit whatever the compressor is still holding back.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            try:
                # Only close the underlying file if we opened it.
                if self._closefp:
                    self._fp.close()
            finally:
                # Always end up in the closed state, even if flushing
                # or closing the underlying file failed.
                self._fp, self._closefp = None, False
                self._mode, self._buffer = _MODE_CLOSED, None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor of the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        can_read = self.readable()
        return can_read and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        At least one byte is returned unless the file is at EOF; how
        many bytes come back beyond that is unspecified.
        """
        self._check_can_read()
        # This leans on undocumented BufferedReader.peek() behaviour:
        # it yields at least one byte (except at EOF) whatever n is.
        return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        A negative or omitted size reads everything up to EOF.
        Returns b'' if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, trying to avoid more
        than one read from the underlying stream. A negative size
        reads up to one buffer's worth of data.

        Returns b'' if the file is at EOF.
        """
        self._check_can_read()
        limit = io.DEFAULT_BUFFER_SIZE if size < 0 else size
        return self._buffer.read1(limit)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        self._check_can_read()
        return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read one line of uncompressed bytes from the file.

        The trailing newline, when there is one, is kept. A
        non-negative size caps the number of bytes read, so the line
        returned may be incomplete. Returns b'' if already at EOF.
        """
        # Accept any integer-like object, but only via __index__.
        if not isinstance(size, int):
            if hasattr(size, "__index__"):
                size = size.__index__()
            else:
                raise TypeError("Integer argument expected")
        self._check_can_read()
        return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size limits how much is read: once the combined length of the
        lines read so far reaches or exceeds size, no further lines
        are read.
        """
        # Accept any integer-like object, but only via __index__.
        if not isinstance(size, int):
            if hasattr(size, "__index__"):
                size = size.__index__()
            else:
                raise TypeError("Integer argument expected")
        self._check_can_read()
        return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Because of buffering, the
        file on disk may not reflect the data until close() is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            nbytes = len(data)
        else:
            # Anything else must support the buffer protocol; nbytes
            # gives its size in bytes regardless of item size.
            data = memoryview(data)
            nbytes = data.nbytes

        payload = self._compressor.compress(data)
        self._fp.write(payload)
        self._pos += nbytes
        return nbytes

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq may be any iterable that yields byte strings.

        No line separators are inserted between the byte strings.
        """
        return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        offset is interpreted relative to the position selected by
        whence, which takes one of these values:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Seeking is emulated, so depending on the arguments this
        operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        # Readers delegate to the buffer; writers report the running
        # total of uncompressed bytes written.
        return self._buffer.tell() if self._mode == _MODE_READ else self._pos
269
270
def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel=compresslevel).
    In this case, the encoding, errors and newline arguments must not
    be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if
    encoding, errors or newline is given in binary mode. If building
    the text wrapper fails, the BZ2File created for it is closed
    before the exception propagates.
    """
    text_mode = "t" in mode
    if text_mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so drop the text flag.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if not text_mode:
        return binary_file

    try:
        # text_encoding() is called from this frame so that any
        # EncodingWarning is attributed to the caller of open().
        encoding = io.text_encoding(encoding)
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    except BaseException:
        # An unknown encoding or invalid newline would otherwise leave
        # the freshly opened BZ2File (and any file handle it owns)
        # unclosed.
        binary_file.close()
        raise
311
312
def compress(data, compresslevel=9):
    """Compress a block of data in one shot.

    compresslevel, if given, must be a number between 1 and 9.

    Use a BZ2Compressor object instead for incremental compression.
    """
    encoder = BZ2Compressor(compresslevel)
    body = encoder.compress(data)
    # flush() finishes the stream and returns whatever is still pending.
    trailer = encoder.flush()
    return body + trailer
322
323
def decompress(data):
    """Decompress a block of data in one shot.

    data may hold several compressed streams back to back; their
    decompressed contents are concatenated. Trailing bytes that do not
    form a valid stream are ignored once at least one stream has been
    decoded.

    Raises OSError if the first stream is invalid, and ValueError if a
    stream ends before its end-of-stream marker.

    Use a BZ2Decompressor object instead for incremental decompression.
    """
    chunks = []
    remaining = data
    while remaining:
        # A decompressor handles exactly one stream, so use a new one
        # for each stream found in the input.
        decoder = BZ2Decompressor()
        try:
            chunk = decoder.decompress(remaining)
        except OSError:
            if not chunks:
                raise  # Error on the first iteration; bail out.
            break  # Leftover data is not a valid bzip2 stream; ignore it.
        else:
            chunks.append(chunk)
        if not decoder.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        remaining = decoder.unused_data
    return b"".join(chunks)
345