1"""Interface to the libbzip2 compression library. 2 3This module provides a file interface, classes for incremental 4(de)compression, and functions for one-shot (de)compression. 5""" 6 7__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", 8 "open", "compress", "decompress"] 9 10__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>" 11 12from builtins import open as _builtin_open 13import io 14import os 15import _compression 16 17from _bz2 import BZ2Compressor, BZ2Decompressor 18 19 20_MODE_CLOSED = 0 21_MODE_READ = 1 22# Value 2 no longer used 23_MODE_WRITE = 3 24 25 26class BZ2File(_compression.BaseStream): 27 28 """A file object providing transparent bzip2 (de)compression. 29 30 A BZ2File can act as a wrapper for an existing file object, or refer 31 directly to a named file on disk. 32 33 Note that BZ2File provides a *binary* file interface - data read is 34 returned as bytes, and data to be written should be given as bytes. 35 """ 36 37 def __init__(self, filename, mode="r", *, compresslevel=9): 38 """Open a bzip2-compressed file. 39 40 If filename is a str, bytes, or PathLike object, it gives the 41 name of the file to be opened. Otherwise, it should be a file 42 object, which will be used to read or write the compressed data. 43 44 mode can be 'r' for reading (default), 'w' for (over)writing, 45 'x' for creating exclusively, or 'a' for appending. These can 46 equivalently be given as 'rb', 'wb', 'xb', and 'ab'. 47 48 If mode is 'w', 'x' or 'a', compresslevel can be a number between 1 49 and 9 specifying the level of compression: 1 produces the least 50 compression, and 9 (default) produces the most compression. 51 52 If mode is 'r', the input file may be the concatenation of 53 multiple compressed streams. 54 """ 55 self._fp = None 56 self._closefp = False 57 self._mode = _MODE_CLOSED 58 59 if not (1 <= compresslevel <= 9): 60 raise ValueError("compresslevel must be between 1 and 9") 61 62 if mode in ("", "r", "rb"): 63 mode = "rb" 64 mode_code = _MODE_READ 65 elif mode in ("w", "wb"): 66 mode = "wb" 67 mode_code = _MODE_WRITE 68 self._compressor = BZ2Compressor(compresslevel) 69 elif mode in ("x", "xb"): 70 mode = "xb" 71 mode_code = _MODE_WRITE 72 self._compressor = BZ2Compressor(compresslevel) 73 elif mode in ("a", "ab"): 74 mode = "ab" 75 mode_code = _MODE_WRITE 76 self._compressor = BZ2Compressor(compresslevel) 77 else: 78 raise ValueError("Invalid mode: %r" % (mode,)) 79 80 if isinstance(filename, (str, bytes, os.PathLike)): 81 self._fp = _builtin_open(filename, mode) 82 self._closefp = True 83 self._mode = mode_code 84 elif hasattr(filename, "read") or hasattr(filename, "write"): 85 self._fp = filename 86 self._mode = mode_code 87 else: 88 raise TypeError("filename must be a str, bytes, file or PathLike object") 89 90 if self._mode == _MODE_READ: 91 raw = _compression.DecompressReader(self._fp, 92 BZ2Decompressor, trailing_error=OSError) 93 self._buffer = io.BufferedReader(raw) 94 else: 95 self._pos = 0 96 97 def close(self): 98 """Flush and close the file. 99 100 May be called more than once without error. Once the file is 101 closed, any other operation on it will raise a ValueError. 102 """ 103 if self._mode == _MODE_CLOSED: 104 return 105 try: 106 if self._mode == _MODE_READ: 107 self._buffer.close() 108 elif self._mode == _MODE_WRITE: 109 self._fp.write(self._compressor.flush()) 110 self._compressor = None 111 finally: 112 try: 113 if self._closefp: 114 self._fp.close() 115 finally: 116 self._fp = None 117 self._closefp = False 118 self._mode = _MODE_CLOSED 119 self._buffer = None 120 121 @property 122 def closed(self): 123 """True if this file is closed.""" 124 return self._mode == _MODE_CLOSED 125 126 def fileno(self): 127 """Return the file descriptor for the underlying file.""" 128 self._check_not_closed() 129 return self._fp.fileno() 130 131 def seekable(self): 132 """Return whether the file supports seeking.""" 133 return self.readable() and self._buffer.seekable() 134 135 def readable(self): 136 """Return whether the file was opened for reading.""" 137 self._check_not_closed() 138 return self._mode == _MODE_READ 139 140 def writable(self): 141 """Return whether the file was opened for writing.""" 142 self._check_not_closed() 143 return self._mode == _MODE_WRITE 144 145 def peek(self, n=0): 146 """Return buffered data without advancing the file position. 147 148 Always returns at least one byte of data, unless at EOF. 149 The exact number of bytes returned is unspecified. 150 """ 151 self._check_can_read() 152 # Relies on the undocumented fact that BufferedReader.peek() 153 # always returns at least one byte (except at EOF), independent 154 # of the value of n 155 return self._buffer.peek(n) 156 157 def read(self, size=-1): 158 """Read up to size uncompressed bytes from the file. 159 160 If size is negative or omitted, read until EOF is reached. 161 Returns b'' if the file is already at EOF. 162 """ 163 self._check_can_read() 164 return self._buffer.read(size) 165 166 def read1(self, size=-1): 167 """Read up to size uncompressed bytes, while trying to avoid 168 making multiple reads from the underlying stream. Reads up to a 169 buffer's worth of data if size is negative. 170 171 Returns b'' if the file is at EOF. 172 """ 173 self._check_can_read() 174 if size < 0: 175 size = io.DEFAULT_BUFFER_SIZE 176 return self._buffer.read1(size) 177 178 def readinto(self, b): 179 """Read bytes into b. 180 181 Returns the number of bytes read (0 for EOF). 182 """ 183 self._check_can_read() 184 return self._buffer.readinto(b) 185 186 def readline(self, size=-1): 187 """Read a line of uncompressed bytes from the file. 188 189 The terminating newline (if present) is retained. If size is 190 non-negative, no more than size bytes will be read (in which 191 case the line may be incomplete). Returns b'' if already at EOF. 192 """ 193 if not isinstance(size, int): 194 if not hasattr(size, "__index__"): 195 raise TypeError("Integer argument expected") 196 size = size.__index__() 197 self._check_can_read() 198 return self._buffer.readline(size) 199 200 def readlines(self, size=-1): 201 """Read a list of lines of uncompressed bytes from the file. 202 203 size can be specified to control the number of lines read: no 204 further lines will be read once the total size of the lines read 205 so far equals or exceeds size. 206 """ 207 if not isinstance(size, int): 208 if not hasattr(size, "__index__"): 209 raise TypeError("Integer argument expected") 210 size = size.__index__() 211 self._check_can_read() 212 return self._buffer.readlines(size) 213 214 def write(self, data): 215 """Write a byte string to the file. 216 217 Returns the number of uncompressed bytes written, which is 218 always the length of data in bytes. Note that due to buffering, 219 the file on disk may not reflect the data written until close() 220 is called. 221 """ 222 self._check_can_write() 223 if isinstance(data, (bytes, bytearray)): 224 length = len(data) 225 else: 226 # accept any data that supports the buffer protocol 227 data = memoryview(data) 228 length = data.nbytes 229 230 compressed = self._compressor.compress(data) 231 self._fp.write(compressed) 232 self._pos += length 233 return length 234 235 def writelines(self, seq): 236 """Write a sequence of byte strings to the file. 237 238 Returns the number of uncompressed bytes written. 239 seq can be any iterable yielding byte strings. 240 241 Line separators are not added between the written byte strings. 242 """ 243 return _compression.BaseStream.writelines(self, seq) 244 245 def seek(self, offset, whence=io.SEEK_SET): 246 """Change the file position. 247 248 The new position is specified by offset, relative to the 249 position indicated by whence. Values for whence are: 250 251 0: start of stream (default); offset must not be negative 252 1: current stream position 253 2: end of stream; offset must not be positive 254 255 Returns the new file position. 256 257 Note that seeking is emulated, so depending on the parameters, 258 this operation may be extremely slow. 259 """ 260 self._check_can_seek() 261 return self._buffer.seek(offset, whence) 262 263 def tell(self): 264 """Return the current file position.""" 265 self._check_not_closed() 266 if self._mode == _MODE_READ: 267 return self._buffer.tell() 268 return self._pos 269 270 271def open(filename, mode="rb", compresslevel=9, 272 encoding=None, errors=None, newline=None): 273 """Open a bzip2-compressed file in binary or text mode. 274 275 The filename argument can be an actual filename (a str, bytes, or 276 PathLike object), or an existing file object to read from or write 277 to. 278 279 The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or 280 "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. 281 The default mode is "rb", and the default compresslevel is 9. 282 283 For binary mode, this function is equivalent to the BZ2File 284 constructor: BZ2File(filename, mode, compresslevel). In this case, 285 the encoding, errors and newline arguments must not be provided. 286 287 For text mode, a BZ2File object is created, and wrapped in an 288 io.TextIOWrapper instance with the specified encoding, error 289 handling behavior, and line ending(s). 290 291 """ 292 if "t" in mode: 293 if "b" in mode: 294 raise ValueError("Invalid mode: %r" % (mode,)) 295 else: 296 if encoding is not None: 297 raise ValueError("Argument 'encoding' not supported in binary mode") 298 if errors is not None: 299 raise ValueError("Argument 'errors' not supported in binary mode") 300 if newline is not None: 301 raise ValueError("Argument 'newline' not supported in binary mode") 302 303 bz_mode = mode.replace("t", "") 304 binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel) 305 306 if "t" in mode: 307 encoding = io.text_encoding(encoding) 308 return io.TextIOWrapper(binary_file, encoding, errors, newline) 309 else: 310 return binary_file 311 312 313def compress(data, compresslevel=9): 314 """Compress a block of data. 315 316 compresslevel, if given, must be a number between 1 and 9. 317 318 For incremental compression, use a BZ2Compressor object instead. 319 """ 320 comp = BZ2Compressor(compresslevel) 321 return comp.compress(data) + comp.flush() 322 323 324def decompress(data): 325 """Decompress a block of data. 326 327 For incremental decompression, use a BZ2Decompressor object instead. 328 """ 329 results = [] 330 while data: 331 decomp = BZ2Decompressor() 332 try: 333 res = decomp.decompress(data) 334 except OSError: 335 if results: 336 break # Leftover data is not a valid bzip2 stream; ignore it. 337 else: 338 raise # Error on the first iteration; bail out. 339 results.append(res) 340 if not decomp.eof: 341 raise ValueError("Compressed data ended before the " 342 "end-of-stream marker was reached") 343 data = decomp.unused_data 344 return b"".join(results) 345