"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

from builtins import open as _builtin_open
import io
import os
import warnings
import _compression
from threading import RLock

from _bz2 import BZ2Compressor, BZ2Decompressor


# Internal state codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", buffering=None, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        buffering is ignored. Its use is deprecated.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if buffering is not None:
            # stacklevel=2 attributes the warning to the caller's line
            # rather than to this module.
            warnings.warn("Use of 'buffering' argument is deprecated",
                          DeprecationWarning, stacklevel=2)

        # Validated for every mode, including read mode.
        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalise the mode to its binary spelling and pick the state code.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # We opened the underlying file, so close() must close it too.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it is left open by close().
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            # NOTE(review): trailing_error=OSError presumably lets the
            # reader tolerate invalid data after the last complete
            # stream - confirm against _compression.DecompressReader.
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor still holds.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset state even if flushing or closing raised.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.

        Raises TypeError if size is not an int and has no __index__().
        """
        # Accept any integer-like object by converting through __index__().
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.

        Raises TypeError if size is not an int and has no __index__().
        """
        # Accept any integer-like object by converting through __index__().
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a bytes-like object to the file.

        Returns the number of uncompressed bytes written, which is
        always the size of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # Accept any object supporting the buffer protocol.
                # len() of such an object counts items, not bytes
                # (e.g. array.array('I', ...)), so use nbytes instead.
                data = memoryview(data)
                length = data.nbytes
            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.

        The work is delegated to the inherited writelines(), and its
        return value is passed through unchanged.
        """
        # Hold the lock for the whole sequence; the nested write() calls
        # re-acquire it, which is why the lock is an RLock.
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            # Write mode: number of uncompressed bytes written so far.
            return self._pos


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode contains both "t" and "b", or if a
    text-only argument is given in binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File only understands binary modes, so strip the text flag.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()


def decompress(data):
    """Decompress a block of data.

    data may hold several concatenated streams; their decompressed
    contents are joined. Raises ValueError if a stream ends before its
    end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    # Each pass decodes one stream; the loop continues on the leftover.
    while data:
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        data = decomp.unused_data
    return b"".join(results)