1"""Interface to the libbzip2 compression library. 2 3This module provides a file interface, classes for incremental 4(de)compression, and functions for one-shot (de)compression. 5""" 6 7__all__ = ["BZ2File", "BZ2Compressor", "BZ2Decompressor", 8 "open", "compress", "decompress"] 9 10__author__ = "Nadeem Vawda <nadeem.vawda@gmail.com>" 11 12from builtins import open as _builtin_open 13import io 14import os 15import _compression 16 17from _bz2 import BZ2Compressor, BZ2Decompressor 18 19 20# Value 0 no longer used 21_MODE_READ = 1 22# Value 2 no longer used 23_MODE_WRITE = 3 24 25 26class BZ2File(_compression.BaseStream): 27 28 """A file object providing transparent bzip2 (de)compression. 29 30 A BZ2File can act as a wrapper for an existing file object, or refer 31 directly to a named file on disk. 32 33 Note that BZ2File provides a *binary* file interface - data read is 34 returned as bytes, and data to be written should be given as bytes. 35 """ 36 37 def __init__(self, filename, mode="r", *, compresslevel=9): 38 """Open a bzip2-compressed file. 39 40 If filename is a str, bytes, or PathLike object, it gives the 41 name of the file to be opened. Otherwise, it should be a file 42 object, which will be used to read or write the compressed data. 43 44 mode can be 'r' for reading (default), 'w' for (over)writing, 45 'x' for creating exclusively, or 'a' for appending. These can 46 equivalently be given as 'rb', 'wb', 'xb', and 'ab'. 47 48 If mode is 'w', 'x' or 'a', compresslevel can be a number between 1 49 and 9 specifying the level of compression: 1 produces the least 50 compression, and 9 (default) produces the most compression. 51 52 If mode is 'r', the input file may be the concatenation of 53 multiple compressed streams. 54 """ 55 self._fp = None 56 self._closefp = False 57 self._mode = None 58 59 if not (1 <= compresslevel <= 9): 60 raise ValueError("compresslevel must be between 1 and 9") 61 62 if mode in ("", "r", "rb"): 63 mode = "rb" 64 mode_code = _MODE_READ 65 elif mode in ("w", "wb"): 66 mode = "wb" 67 mode_code = _MODE_WRITE 68 self._compressor = BZ2Compressor(compresslevel) 69 elif mode in ("x", "xb"): 70 mode = "xb" 71 mode_code = _MODE_WRITE 72 self._compressor = BZ2Compressor(compresslevel) 73 elif mode in ("a", "ab"): 74 mode = "ab" 75 mode_code = _MODE_WRITE 76 self._compressor = BZ2Compressor(compresslevel) 77 else: 78 raise ValueError("Invalid mode: %r" % (mode,)) 79 80 if isinstance(filename, (str, bytes, os.PathLike)): 81 self._fp = _builtin_open(filename, mode) 82 self._closefp = True 83 self._mode = mode_code 84 elif hasattr(filename, "read") or hasattr(filename, "write"): 85 self._fp = filename 86 self._mode = mode_code 87 else: 88 raise TypeError("filename must be a str, bytes, file or PathLike object") 89 90 if self._mode == _MODE_READ: 91 raw = _compression.DecompressReader(self._fp, 92 BZ2Decompressor, trailing_error=OSError) 93 self._buffer = io.BufferedReader(raw) 94 else: 95 self._pos = 0 96 97 def close(self): 98 """Flush and close the file. 99 100 May be called more than once without error. Once the file is 101 closed, any other operation on it will raise a ValueError. 102 """ 103 if self.closed: 104 return 105 try: 106 if self._mode == _MODE_READ: 107 self._buffer.close() 108 elif self._mode == _MODE_WRITE: 109 self._fp.write(self._compressor.flush()) 110 self._compressor = None 111 finally: 112 try: 113 if self._closefp: 114 self._fp.close() 115 finally: 116 self._fp = None 117 self._closefp = False 118 self._buffer = None 119 120 @property 121 def closed(self): 122 """True if this file is closed.""" 123 return self._fp is None 124 125 @property 126 def name(self): 127 self._check_not_closed() 128 return self._fp.name 129 130 @property 131 def mode(self): 132 return 'wb' if self._mode == _MODE_WRITE else 'rb' 133 134 def fileno(self): 135 """Return the file descriptor for the underlying file.""" 136 self._check_not_closed() 137 return self._fp.fileno() 138 139 def seekable(self): 140 """Return whether the file supports seeking.""" 141 return self.readable() and self._buffer.seekable() 142 143 def readable(self): 144 """Return whether the file was opened for reading.""" 145 self._check_not_closed() 146 return self._mode == _MODE_READ 147 148 def writable(self): 149 """Return whether the file was opened for writing.""" 150 self._check_not_closed() 151 return self._mode == _MODE_WRITE 152 153 def peek(self, n=0): 154 """Return buffered data without advancing the file position. 155 156 Always returns at least one byte of data, unless at EOF. 157 The exact number of bytes returned is unspecified. 158 """ 159 self._check_can_read() 160 # Relies on the undocumented fact that BufferedReader.peek() 161 # always returns at least one byte (except at EOF), independent 162 # of the value of n 163 return self._buffer.peek(n) 164 165 def read(self, size=-1): 166 """Read up to size uncompressed bytes from the file. 167 168 If size is negative or omitted, read until EOF is reached. 169 Returns b'' if the file is already at EOF. 170 """ 171 self._check_can_read() 172 return self._buffer.read(size) 173 174 def read1(self, size=-1): 175 """Read up to size uncompressed bytes, while trying to avoid 176 making multiple reads from the underlying stream. Reads up to a 177 buffer's worth of data if size is negative. 178 179 Returns b'' if the file is at EOF. 180 """ 181 self._check_can_read() 182 if size < 0: 183 size = io.DEFAULT_BUFFER_SIZE 184 return self._buffer.read1(size) 185 186 def readinto(self, b): 187 """Read bytes into b. 188 189 Returns the number of bytes read (0 for EOF). 190 """ 191 self._check_can_read() 192 return self._buffer.readinto(b) 193 194 def readline(self, size=-1): 195 """Read a line of uncompressed bytes from the file. 196 197 The terminating newline (if present) is retained. If size is 198 non-negative, no more than size bytes will be read (in which 199 case the line may be incomplete). Returns b'' if already at EOF. 200 """ 201 if not isinstance(size, int): 202 if not hasattr(size, "__index__"): 203 raise TypeError("Integer argument expected") 204 size = size.__index__() 205 self._check_can_read() 206 return self._buffer.readline(size) 207 208 def readlines(self, size=-1): 209 """Read a list of lines of uncompressed bytes from the file. 210 211 size can be specified to control the number of lines read: no 212 further lines will be read once the total size of the lines read 213 so far equals or exceeds size. 214 """ 215 if not isinstance(size, int): 216 if not hasattr(size, "__index__"): 217 raise TypeError("Integer argument expected") 218 size = size.__index__() 219 self._check_can_read() 220 return self._buffer.readlines(size) 221 222 def write(self, data): 223 """Write a byte string to the file. 224 225 Returns the number of uncompressed bytes written, which is 226 always the length of data in bytes. Note that due to buffering, 227 the file on disk may not reflect the data written until close() 228 is called. 229 """ 230 self._check_can_write() 231 if isinstance(data, (bytes, bytearray)): 232 length = len(data) 233 else: 234 # accept any data that supports the buffer protocol 235 data = memoryview(data) 236 length = data.nbytes 237 238 compressed = self._compressor.compress(data) 239 self._fp.write(compressed) 240 self._pos += length 241 return length 242 243 def writelines(self, seq): 244 """Write a sequence of byte strings to the file. 245 246 Returns the number of uncompressed bytes written. 247 seq can be any iterable yielding byte strings. 248 249 Line separators are not added between the written byte strings. 250 """ 251 return _compression.BaseStream.writelines(self, seq) 252 253 def seek(self, offset, whence=io.SEEK_SET): 254 """Change the file position. 255 256 The new position is specified by offset, relative to the 257 position indicated by whence. Values for whence are: 258 259 0: start of stream (default); offset must not be negative 260 1: current stream position 261 2: end of stream; offset must not be positive 262 263 Returns the new file position. 264 265 Note that seeking is emulated, so depending on the parameters, 266 this operation may be extremely slow. 267 """ 268 self._check_can_seek() 269 return self._buffer.seek(offset, whence) 270 271 def tell(self): 272 """Return the current file position.""" 273 self._check_not_closed() 274 if self._mode == _MODE_READ: 275 return self._buffer.tell() 276 return self._pos 277 278 279def open(filename, mode="rb", compresslevel=9, 280 encoding=None, errors=None, newline=None): 281 """Open a bzip2-compressed file in binary or text mode. 282 283 The filename argument can be an actual filename (a str, bytes, or 284 PathLike object), or an existing file object to read from or write 285 to. 286 287 The mode argument can be "r", "rb", "w", "wb", "x", "xb", "a" or 288 "ab" for binary mode, or "rt", "wt", "xt" or "at" for text mode. 289 The default mode is "rb", and the default compresslevel is 9. 290 291 For binary mode, this function is equivalent to the BZ2File 292 constructor: BZ2File(filename, mode, compresslevel). In this case, 293 the encoding, errors and newline arguments must not be provided. 294 295 For text mode, a BZ2File object is created, and wrapped in an 296 io.TextIOWrapper instance with the specified encoding, error 297 handling behavior, and line ending(s). 298 299 """ 300 if "t" in mode: 301 if "b" in mode: 302 raise ValueError("Invalid mode: %r" % (mode,)) 303 else: 304 if encoding is not None: 305 raise ValueError("Argument 'encoding' not supported in binary mode") 306 if errors is not None: 307 raise ValueError("Argument 'errors' not supported in binary mode") 308 if newline is not None: 309 raise ValueError("Argument 'newline' not supported in binary mode") 310 311 bz_mode = mode.replace("t", "") 312 binary_file = BZ2File(filename, bz_mode, compresslevel=compresslevel) 313 314 if "t" in mode: 315 encoding = io.text_encoding(encoding) 316 return io.TextIOWrapper(binary_file, encoding, errors, newline) 317 else: 318 return binary_file 319 320 321def compress(data, compresslevel=9): 322 """Compress a block of data. 323 324 compresslevel, if given, must be a number between 1 and 9. 325 326 For incremental compression, use a BZ2Compressor object instead. 327 """ 328 comp = BZ2Compressor(compresslevel) 329 return comp.compress(data) + comp.flush() 330 331 332def decompress(data): 333 """Decompress a block of data. 334 335 For incremental decompression, use a BZ2Decompressor object instead. 336 """ 337 results = [] 338 while data: 339 decomp = BZ2Decompressor() 340 try: 341 res = decomp.decompress(data) 342 except OSError: 343 if results: 344 break # Leftover data is not a valid bzip2 stream; ignore it. 345 else: 346 raise # Error on the first iteration; bail out. 347 results.append(res) 348 if not decomp.eof: 349 raise ValueError("Compressed data ended before the " 350 "end-of-stream marker was reached") 351 data = decomp.unused_data 352 return b"".join(results) 353