"""Interface to the liblzma compression library.

This module provides a class for reading and writing compressed files,
classes for incremental (de)compression, and convenience functions for
one-shot (de)compression.

These classes and functions support both the XZ and legacy LZMA
container formats, as well as raw compressed data streams.
"""

__all__ = [
    "CHECK_NONE", "CHECK_CRC32", "CHECK_CRC64", "CHECK_SHA256",
    "CHECK_ID_MAX", "CHECK_UNKNOWN",
    "FILTER_LZMA1", "FILTER_LZMA2", "FILTER_DELTA", "FILTER_X86", "FILTER_IA64",
    "FILTER_ARM", "FILTER_ARMTHUMB", "FILTER_POWERPC", "FILTER_SPARC",
    "FORMAT_AUTO", "FORMAT_XZ", "FORMAT_ALONE", "FORMAT_RAW",
    "MF_HC3", "MF_HC4", "MF_BT2", "MF_BT3", "MF_BT4",
    "MODE_FAST", "MODE_NORMAL", "PRESET_DEFAULT", "PRESET_EXTREME",

    "LZMACompressor", "LZMADecompressor", "LZMAFile", "LZMAError",
    "open", "compress", "decompress", "is_check_supported",
]

import builtins
import io
import os
from _lzma import *
from _lzma import _encode_filter_properties, _decode_filter_properties
import _compression


# Value 0 no longer used
_MODE_READ = 1
# Value 2 no longer used
_MODE_WRITE = 3


class LZMAFile(_compression.BaseStream):

    """A file object providing transparent LZMA (de)compression.

    An LZMAFile can act as a wrapper for an existing file object, or
    refer directly to a named file on disk.

    Note that LZMAFile provides a *binary* file interface - data read
    is returned as bytes, and data to be written must be given as bytes.
    """

    def __init__(self, filename=None, mode="r", *,
                 format=None, check=-1, preset=None, filters=None):
        """Open an LZMA-compressed file in binary mode.

        filename can be either an actual file name (given as a str,
        bytes, or PathLike object), in which case the named file is
        opened, or it can be an existing file object to read from or
        write to.

        mode can be "r" for reading (default), "w" for (over)writing,
        "x" for creating exclusively, or "a" for appending. These can
        equivalently be given as "rb", "wb", "xb" and "ab" respectively.

        format specifies the container format to use for the file.
        If mode is "r", this defaults to FORMAT_AUTO. Otherwise, the
        default is FORMAT_XZ.

        check specifies the integrity check to use. This argument can
        only be used when opening a file for writing. For FORMAT_XZ,
        the default is CHECK_CRC64. FORMAT_ALONE and FORMAT_RAW do not
        support integrity checks - for these formats, check must be
        omitted, or be CHECK_NONE.

        When opening a file for reading, the *preset* argument is not
        meaningful, and should be omitted. The *filters* argument should
        also be omitted, except when format is FORMAT_RAW (in which case
        it is required).

        When opening a file for writing, the settings used by the
        compressor can be specified either as a preset compression
        level (with the *preset* argument), or in detail as a custom
        filter chain (with the *filters* argument). For FORMAT_XZ and
        FORMAT_ALONE, the default is to use the PRESET_DEFAULT preset
        level. For FORMAT_RAW, the caller must always specify a filter
        chain; the raw compressor does not support preset compression
        levels.

        preset (if provided) should be an integer in the range 0-9,
        optionally OR-ed with the constant PRESET_EXTREME.

        filters (if provided) should be a sequence of dicts. Each dict
        should have an entry for "id" indicating ID of the filter, plus
        additional entries for options to the filter.

        Raises ValueError for an invalid mode or for arguments that are
        not allowed in the requested mode, and TypeError if filename is
        neither a path nor a file-like object.
        """
        # Set these first so that close() and the `closed` property work
        # even if the rest of the constructor raises.
        self._fp = None
        self._closefp = False
        self._mode = None

        if mode in ("r", "rb"):
            if check != -1:
                raise ValueError("Cannot specify an integrity check "
                                 "when opening a file for reading")
            if preset is not None:
                raise ValueError("Cannot specify a preset compression "
                                 "level when opening a file for reading")
            if format is None:
                format = FORMAT_AUTO
            mode_code = _MODE_READ
        elif mode in ("w", "wb", "a", "ab", "x", "xb"):
            if format is None:
                format = FORMAT_XZ
            mode_code = _MODE_WRITE
            # The compressor is built before any file is opened, so bad
            # compression settings fail without touching the filesystem.
            self._compressor = LZMACompressor(format=format, check=check,
                                              preset=preset, filters=filters)
            self._pos = 0
        else:
            raise ValueError("Invalid mode: {!r}".format(mode))

        if isinstance(filename, (str, bytes, os.PathLike)):
            if "b" not in mode:
                mode += "b"
            self._fp = builtins.open(filename, mode)
            self._closefp = True
            self._mode = mode_code
        elif hasattr(filename, "read") or hasattr(filename, "write"):
            self._fp = filename
            self._mode = mode_code
        else:
            raise TypeError("filename must be a str, bytes, file or PathLike object")

        if self._mode == _MODE_READ:
            try:
                raw = _compression.DecompressReader(self._fp, LZMADecompressor,
                    trailing_error=LZMAError, format=format, filters=filters)
            except BaseException:
                # Building the reader can fail (for example FORMAT_RAW
                # without filters).  Release a file we opened ourselves
                # right away and mark the object closed, instead of
                # leaving a half-initialised object whose close() would
                # later trip over the missing _buffer attribute.
                fp, closefp = self._fp, self._closefp
                self._fp = None
                self._closefp = False
                if closefp:
                    fp.close()
                raise
            self._buffer = io.BufferedReader(raw)

    def close(self):
        """Flush and close the file.

        May be called more than once without error. Once the file is
        closed, any other operation on it will raise a ValueError.
        """
        if self.closed:
            return
        try:
            if self._mode == _MODE_READ:
                self._buffer.close()
                self._buffer = None
            elif self._mode == _MODE_WRITE:
                # Emit whatever the compressor still holds before closing.
                self._fp.write(self._compressor.flush())
                self._compressor = None
        finally:
            try:
                # Only close the underlying file if we opened it ourselves.
                if self._closefp:
                    self._fp.close()
            finally:
                self._fp = None
                self._closefp = False

    @property
    def closed(self):
        """True if this file is closed."""
        return self._fp is None

    @property
    def name(self):
        """The name attribute of the underlying file object."""
        self._check_not_closed()
        return self._fp.name

    @property
    def mode(self):
        """'wb' if the file was opened for writing, otherwise 'rb'."""
        return 'wb' if self._mode == _MODE_WRITE else 'rb'

    def fileno(self):
        """Return the file descriptor for the underlying file."""
        self._check_not_closed()
        return self._fp.fileno()

    def seekable(self):
        """Return whether the file supports seeking."""
        return self.readable() and self._buffer.seekable()

    def readable(self):
        """Return whether the file was opened for reading."""
        self._check_not_closed()
        return self._mode == _MODE_READ

    def writable(self):
        """Return whether the file was opened for writing."""
        self._check_not_closed()
        return self._mode == _MODE_WRITE

    def peek(self, size=-1):
        """Return buffered data without advancing the file position.

        Always returns at least one byte of data, unless at EOF.
        The exact number of bytes returned is unspecified.
        """
        self._check_can_read()
        # Relies on the undocumented fact that BufferedReader.peek() always
        # returns at least one byte (except at EOF)
        return self._buffer.peek(size)

    def read(self, size=-1):
        """Read up to size uncompressed bytes from the file.

        If size is negative or omitted, read until EOF is reached.
        Returns b"" if the file is already at EOF.
        """
        self._check_can_read()
        return self._buffer.read(size)

    def read1(self, size=-1):
        """Read up to size uncompressed bytes, while trying to avoid
        making multiple reads from the underlying stream. Reads up to a
        buffer's worth of data if size is negative.

        Returns b"" if the file is at EOF.
        """
        self._check_can_read()
        if size < 0:
            size = io.DEFAULT_BUFFER_SIZE
        return self._buffer.read1(size)

    def readline(self, size=-1):
        """Read a line of uncompressed bytes from the file.

        The terminating newline (if present) is retained. If size is
        non-negative, no more than size bytes will be read (in which
        case the line may be incomplete). Returns b'' if already at EOF.
        """
        self._check_can_read()
        return self._buffer.readline(size)

    def write(self, data):
        """Write a bytes object to the file.

        Returns the number of uncompressed bytes written, which is
        always the length of data in bytes. Note that due to buffering,
        the file on disk may not reflect the data written until close()
        is called.
        """
        self._check_can_write()
        if isinstance(data, (bytes, bytearray)):
            length = len(data)
        else:
            # accept any data that supports the buffer protocol
            data = memoryview(data)
            length = data.nbytes

        compressed = self._compressor.compress(data)
        self._fp.write(compressed)
        # _pos counts uncompressed bytes; tell() reports it in write mode.
        self._pos += length
        return length

    def seek(self, offset, whence=io.SEEK_SET):
        """Change the file position.

        The new position is specified by offset, relative to the
        position indicated by whence. Possible values for whence are:

            0: start of stream (default): offset must not be negative
            1: current stream position
            2: end of stream; offset must not be positive

        Returns the new file position.

        Note that seeking is emulated, so depending on the parameters,
        this operation may be extremely slow.
        """
        self._check_can_seek()
        return self._buffer.seek(offset, whence)

    def tell(self):
        """Return the current file position."""
        self._check_not_closed()
        if self._mode == _MODE_READ:
            return self._buffer.tell()
        return self._pos


def open(filename, mode="rb", *,
         format=None, check=-1, preset=None, filters=None,
         encoding=None, errors=None, newline=None):
    """Open an LZMA-compressed file in binary or text mode.

    filename can be either an actual file name (given as a str, bytes,
    or PathLike object), in which case the named file is opened, or it
    can be an existing file object to read from or write to.

    The mode argument can be "r", "rb" (default), "w", "wb", "x", "xb",
    "a", or "ab" for binary mode, or "rt", "wt", "xt", or "at" for text
    mode.

    The format, check, preset and filters arguments specify the
    compression settings, as for LZMACompressor, LZMADecompressor and
    LZMAFile.

    For binary mode, this function is equivalent to the LZMAFile
    constructor: LZMAFile(filename, mode, ...). In this case, the
    encoding, errors and newline arguments must not be provided.

    For text mode, an LZMAFile object is created, and wrapped in an
    io.TextIOWrapper instance with the specified encoding, error
    handling behavior, and line ending(s).

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if errors is not None:
            raise ValueError("Argument 'errors' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")

    # LZMAFile itself only understands binary modes; strip the text flag.
    lz_mode = mode.replace("t", "")
    binary_file = LZMAFile(filename, lz_mode, format=format, check=check,
                           preset=preset, filters=filters)

    if "t" in mode:
        encoding = io.text_encoding(encoding)
        return io.TextIOWrapper(binary_file, encoding, errors, newline)
    else:
        return binary_file


def compress(data, format=FORMAT_XZ, check=-1, preset=None, filters=None):
    """Compress a block of data.

    Refer to LZMACompressor's docstring for a description of the
    optional arguments *format*, *check*, *preset* and *filters*.

    For incremental compression, use an LZMACompressor instead.
    """
    comp = LZMACompressor(format, check, preset, filters)
    return comp.compress(data) + comp.flush()


def decompress(data, format=FORMAT_AUTO, memlimit=None, filters=None):
    """Decompress a block of data.

    Refer to LZMADecompressor's docstring for a description of the
    optional arguments *format*, *memlimit* and *filters*.

    If data holds several concatenated streams, all of them are
    decompressed and the results joined. Raises LZMAError if the first
    stream is invalid or if a stream ends before its end-of-stream
    marker.

    For incremental decompression, use an LZMADecompressor instead.
    """
    results = []
    while True:
        # Each concatenated stream needs a fresh decompressor object.
        decomp = LZMADecompressor(format, memlimit, filters)
        try:
            res = decomp.decompress(data)
        except LZMAError:
            if results:
                break  # Leftover data is not a valid LZMA/XZ stream; ignore it.
            else:
                raise  # Error on the first iteration; bail out.
        results.append(res)
        if not decomp.eof:
            raise LZMAError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        data = decomp.unused_data
        if not data:
            break
    return b"".join(results)