"""Interface to the libbzip2 compression library.

This module provides a file interface, classes for incremental
(de)compression, and functions for one-shot (de)compression.
"""

__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor",
           "open", "compress", "decompress"]

__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>"

from builtins import open as _builtin_open
import io
import os
from threading import RLock

try:
    import _compression
except ImportError:
    # NOTE(review): newer CPython releases appear to ship this private
    # helper as compression._common._streams instead -- confirm against
    # the interpreter versions this file has to support.
    from compression._common import _streams as _compression

from _bz2 import BZ2Compressor, BZ2Decompressor


# Internal state codes stored in BZ2File._mode.
_MODE_CLOSED = 0
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class BZ2File(_compression.BaseStream):

    """A file object providing transparent bzip2 (de)compression.

    A BZ2File can act as a wrapper for an existing file object, or refer
    directly to a named file on disk.

    Note that BZ2File provides a *binary* file interface - data read is
    returned as bytes, and data to be written should be given as bytes.
    """

    def __init__(self, filename, mode="r", *, compresslevel=9):
        """Open a bzip2-compressed file.

        If filename is a str, bytes, or PathLike object, it gives the
        name of the file to be opened. Otherwise, it should be a file
        object, which will be used to read or write the compressed data.

        mode can be 'r' for reading (default), 'w' for (over)writing,
        'x' for creating exclusively, or 'a' for appending. These can
        equivalently be given as 'rb', 'wb', 'xb', and 'ab'.

        If mode is 'w', 'x' or 'a', compresslevel can be a number between 1
        and 9 specifying the level of compression: 1 produces the least
        compression, and 9 (default) produces the most compression.

        If mode is 'r', the input file may be the concatenation of
        multiple compressed streams.

        Raises ValueError for an out-of-range compresslevel or an
        unrecognised mode, and TypeError if filename is neither a path
        nor an object with a read() or write() method.
        """
        # This lock must be recursive, so that BufferedIOBase's
        # writelines() does not deadlock.
        self._lock = RLock()
        # Start out in the "closed" state so that close() is a harmless
        # no-op if construction fails part-way through.
        self._fp = None
        self._closefp = False
        self._mode = _MODE_CLOSED

        if not (1 <= compresslevel <= 9):
            raise ValueError("compresslevel must be between 1 and 9")

        # Normalise the mode to its binary spelling (used when opening a
        # named file) and pick the matching internal state code.
        if mode in ("", "r", "rb"):
            mode = "rb"
            mode_code = _MODE_READ
        elif mode in ("w", "wb"):
            mode = "wb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("x", "xb"):
            mode = "xb"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        elif mode in ("a", "ab"):
            mode = "ab"
            mode_code = _MODE_WRITE
            self._compressor = BZ2Compressor(compresslevel)
        else:
            raise ValueError("Invalid mode: %r" % (mode,))

        if isinstance(filename, (str, bytes, os.PathLike)):
            # We opened the underlying file, so we must close it too.
            self._fp = _builtin_open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            # Caller-supplied file object: it stays the caller's to close.
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            raw = _compression.DecompressReader(self._fp,
                BZ2Decompressor, trailing_error=OSError)
            self._buffer = io.BufferedReader(raw)
        else:
            # Count of uncompressed bytes written so far; reported by tell().
            self._pos = 0

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        with self._lock:
            if self._mode == _MODE_CLOSED:
                return
            try:
                if self._mode == _MODE_READ:
                    self._buffer.close()
                elif self._mode == _MODE_WRITE:
                    # Emit whatever the compressor is still holding back.
                    self._fp.write(self._compressor.flush())
                    self._compressor = None
            finally:
                # Reset our state even if flushing or closing the
                # underlying file raised.
                try:
                    if self._closefp:
                        self._fp.close()
                finally:
                    self._fp = None
                    self._closefp = False
                    self._mode = _MODE_CLOSED
                    self._buffer = None

    @property
    def closed(self):
        """True if this file is closed."""
        return self._mode == _MODE_CLOSED

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, n=0):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        with self._lock:
            self._check_can_read()
            # Relies on the undocumented fact that BufferedReader.peek()
            # always returns at least one byte (except at EOF), independent
            # of the value of n
            return self._buffer.peek(n)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b'' if the file is already at EOF.
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b'' if the file is at EOF.
        """
        with self._lock:
            self._check_can_read()
            if size < 0:
                size = io.DEFAULT_BUFFER_SIZE
            return self._buffer.read1(size)

    def readinto(self, b):
        """Read bytes into b.

        Returns the number of bytes read (0 for EOF).
        """
        with self._lock:
            self._check_can_read()
            return self._buffer.readinto(b)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        # Accept any integer-like object, not just int instances.
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readline(size)

    def readlines(self, size=-1):
        """Read a list of lines of uncompressed bytes from the file.

        size can be specified to control the number of lines read: no
        further lines will be read once the total size of the lines read
        so far equals or exceeds size.
        """
        # Accept any integer-like object, not just int instances.
        if not isinstance(size, int):
            if not hasattr(size, "__index__"):
                raise TypeError("Integer argument expected")
            size = size.__index__()
        with self._lock:
            self._check_can_read()
            return self._buffer.readlines(size)

    def write(self, data):
        """Write a byte string to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        with self._lock:
            self._check_can_write()
            if isinstance(data, (bytes, bytearray)):
                length = len(data)
            else:
                # Accept any object supporting the buffer protocol. len()
                # counts items rather than bytes for multi-byte item
                # types (e.g. array.array("H")), so use nbytes instead.
                data = memoryview(data)
                length = data.nbytes

            compressed = self._compressor.compress(data)
            self._fp.write(compressed)
            self._pos += length
            return length

    def writelines(self, seq):
        """Write a sequence of byte strings to the file.

        Returns the number of uncompressed bytes written.
        seq can be any iterable yielding byte strings.

        Line separators are not added between the written byte strings.
        """
        with self._lock:
            return _compression.BaseStream.writelines(self, seq)

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Values for whence are:

            0: start of stream (default); offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        with self._lock:
            self._check_can_seek()
            return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        with self._lock:
            self._check_not_closed()
            if self._mode == _MODE_READ:
                return self._buffer.tell()
            return self._pos


def open(filename, mode="rb", compresslevel=9,
         encoding=None, errors=None, newline=None):
    """Open a bzip2-compressed file in binary or text mode.

    The filename argument can be an actual filename (a str, bytes, or
    PathLike object), or an existing file object to read from or write
    to.

    The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or
    "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode.
    The default mode is "rb", and the default compresslevel is 9.

    For binary mode, this function is equivalent to the BZ2File
    constructor: BZ2File(filename, mode, compresslevel). In this case,
    the encoding, errors and newline arguments must not be provided.

    For text mode, a BZ2File object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    Raises ValueError if mode mixes "t" and "b", or if a text-only
    argument is supplied in binary mode.
    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # BZ2File itself only understands the binary spellings of the mode.
    bz_mode = mode.replace("t", "")
    binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel)

    if "t" in mode:
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def compress(data, compresslevel=9):
    """Compress a block of data.

    compresslevel, if given, must be a number between 1 and 9.

    For incremental compression, use a BZ2Compressor object instead.
    """
    comp = BZ2Compressor(compresslevel)
    return comp.compress(data) + comp.flush()


def decompress(data):
    """Decompress a block of data.

    data may be the concatenation of several compressed streams; the
    decompressed contents of all of them are joined and returned.

    Raises ValueError if the data ends before an end-of-stream marker.

    For incremental decompression, use a BZ2Decompressor object instead.
    """
    results = []
    while data:
        # Each stream needs a fresh decompressor; whatever follows the
        # end of one stream is handed to the next iteration.
        decomp = BZ2Decompressor()
        try:
            res = decomp.decompress(data)
        except OSError:
            if results:
                break  # Leftover data is not a valid bzip2 stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise ValueError("Compressed data ended before the "
                             "end-of-stream marker was reached")
        data = decomp.unused_data
    return b"".join(results)