1 /* 2 * SeekableXZInputStream 3 * 4 * Author: Lasse Collin <lasse.collin@tukaani.org> 5 * 6 * This file has been put into the public domain. 7 * You can do whatever you want with this file. 8 */ 9 10 package org.tukaani.xz; 11 12 import java.util.Arrays; 13 import java.util.ArrayList; 14 import java.io.DataInputStream; 15 import java.io.IOException; 16 import java.io.EOFException; 17 import org.tukaani.xz.common.DecoderUtil; 18 import org.tukaani.xz.common.StreamFlags; 19 import org.tukaani.xz.check.Check; 20 import org.tukaani.xz.index.IndexDecoder; 21 import org.tukaani.xz.index.BlockInfo; 22 23 /** 24 * Decompresses a .xz file in random access mode. 25 * This supports decompressing concatenated .xz files. 26 * <p> 27 * Each .xz file consist of one or more Streams. Each Stream consist of zero 28 * or more Blocks. Each Stream contains an Index of Streams' Blocks. 29 * The Indexes from all Streams are loaded in RAM by a constructor of this 30 * class. A typical .xz file has only one Stream, and parsing its Index will 31 * need only three or four seeks. 32 * <p> 33 * To make random access possible, the data in a .xz file must be splitted 34 * into multiple Blocks of reasonable size. Decompression can only start at 35 * a Block boundary. When seeking to an uncompressed position that is not at 36 * a Block boundary, decompression starts at the beginning of the Block and 37 * throws away data until the target position is reached. Thus, smaller Blocks 38 * mean faster seeks to arbitrary uncompressed positions. On the other hand, 39 * smaller Blocks mean worse compression. So one has to make a compromise 40 * between random access speed and compression ratio. 41 * <p> 42 * Implementation note: This class uses linear search to locate the correct 43 * Stream from the data structures in RAM. It was the simplest to implement 44 * and should be fine as long as there aren't too many Streams. The correct 45 * Block inside a Stream is located using binary search and thus is fast 46 * even with a huge number of Blocks. 47 * 48 * <h4>Memory usage</h4> 49 * <p> 50 * The amount of memory needed for the Indexes is taken into account when 51 * checking the memory usage limit. Each Stream is calculated to need at 52 * least 1 KiB of memory and each Block 16 bytes of memory, rounded up 53 * to the next kibibyte. So unless the file has a huge number of Streams or 54 * Blocks, these don't take significant amount of memory. 55 * 56 * <h4>Creating random-accessible .xz files</h4> 57 * <p> 58 * When using {@link XZOutputStream}, a new Block can be started by calling 59 * its {@link XZOutputStream#endBlock() endBlock} method. If you know 60 * that the decompressor will only need to seek to certain uncompressed 61 * positions, it can be a good idea to start a new Block at (some of) these 62 * positions (and only at these positions to get better compression ratio). 63 * <p> 64 * liblzma in XZ Utils supports starting a new Block with 65 * <code>LZMA_FULL_FLUSH</code>. XZ Utils 5.1.1alpha added threaded 66 * compression which creates multi-Block .xz files. XZ Utils 5.1.1alpha 67 * also added the option <code>--block-size=SIZE</code> to the xz command 68 * line tool. XZ Utils 5.1.2alpha added a partial implementation of 69 * <code>--block-list=SIZES</code> which allows specifying sizes of 70 * individual Blocks. 71 * 72 * @see SeekableFileInputStream 73 * @see XZInputStream 74 * @see XZOutputStream 75 */ 76 public class SeekableXZInputStream extends SeekableInputStream { 77 /** 78 * Cache for big arrays. 79 */ 80 private final ArrayCache arrayCache; 81 82 /** 83 * The input stream containing XZ compressed data. 84 */ 85 private SeekableInputStream in; 86 87 /** 88 * Memory usage limit after the memory usage of the IndexDecoders have 89 * been substracted. 90 */ 91 private final int memoryLimit; 92 93 /** 94 * Memory usage of the IndexDecoders. 95 * <code>memoryLimit + indexMemoryUsage</code> equals the original 96 * memory usage limit that was passed to the constructor. 97 */ 98 private int indexMemoryUsage = 0; 99 100 /** 101 * List of IndexDecoders, one for each Stream in the file. 102 * The list is in reverse order: The first element is 103 * the last Stream in the file. 104 */ 105 private final ArrayList<IndexDecoder> streams 106 = new ArrayList<IndexDecoder>(); 107 108 /** 109 * Bitmask of all Check IDs seen. 110 */ 111 private int checkTypes = 0; 112 113 /** 114 * Uncompressed size of the file (all Streams). 115 */ 116 private long uncompressedSize = 0; 117 118 /** 119 * Uncompressed size of the largest XZ Block in the file. 120 */ 121 private long largestBlockSize = 0; 122 123 /** 124 * Number of XZ Blocks in the file. 125 */ 126 private int blockCount = 0; 127 128 /** 129 * Size and position information about the current Block. 130 * If there are no Blocks, all values will be <code>-1</code>. 131 */ 132 private final BlockInfo curBlockInfo; 133 134 /** 135 * Temporary (and cached) information about the Block whose information 136 * is queried via <code>getBlockPos</code> and related functions. 137 */ 138 private final BlockInfo queriedBlockInfo; 139 140 /** 141 * Integrity Check in the current XZ Stream. The constructor leaves 142 * this to point to the Check of the first Stream. 143 */ 144 private Check check; 145 146 /** 147 * Flag indicating if the integrity checks will be verified. 148 */ 149 private final boolean verifyCheck; 150 151 /** 152 * Decoder of the current XZ Block, if any. 153 */ 154 private BlockInputStream blockDecoder = null; 155 156 /** 157 * Current uncompressed position. 158 */ 159 private long curPos = 0; 160 161 /** 162 * Target position for seeking. 163 */ 164 private long seekPos; 165 166 /** 167 * True when <code>seek(long)</code> has been called but the actual 168 * seeking hasn't been done yet. 169 */ 170 private boolean seekNeeded = false; 171 172 /** 173 * True when end of the file was reached. This can be cleared by 174 * calling <code>seek(long)</code>. 175 */ 176 private boolean endReached = false; 177 178 /** 179 * Pending exception from an earlier error. 180 */ 181 private IOException exception = null; 182 183 /** 184 * Temporary buffer for read(). This avoids reallocating memory 185 * on every read() call. 186 */ 187 private final byte[] tempBuf = new byte[1]; 188 189 /** 190 * Creates a new seekable XZ decompressor without a memory usage limit. 191 * 192 * @param in seekable input stream containing one or more 193 * XZ Streams; the whole input stream is used 194 * 195 * @throws XZFormatException 196 * input is not in the XZ format 197 * 198 * @throws CorruptedInputException 199 * XZ data is corrupt or truncated 200 * 201 * @throws UnsupportedOptionsException 202 * XZ headers seem valid but they specify 203 * options not supported by this implementation 204 * 205 * @throws EOFException 206 * less than 6 bytes of input was available 207 * from <code>in</code>, or (unlikely) the size 208 * of the underlying stream got smaller while 209 * this was reading from it 210 * 211 * @throws IOException may be thrown by <code>in</code> 212 */ SeekableXZInputStream(SeekableInputStream in)213 public SeekableXZInputStream(SeekableInputStream in) 214 throws IOException { 215 this(in, -1); 216 } 217 218 /** 219 * Creates a new seekable XZ decompressor without a memory usage limit. 220 * <p> 221 * This is identical to 222 * <code>SeekableXZInputStream(SeekableInputStream)</code> except that 223 * this also takes the <code>arrayCache</code> argument. 224 * 225 * @param in seekable input stream containing one or more 226 * XZ Streams; the whole input stream is used 227 * 228 * @param arrayCache cache to be used for allocating large arrays 229 * 230 * @throws XZFormatException 231 * input is not in the XZ format 232 * 233 * @throws CorruptedInputException 234 * XZ data is corrupt or truncated 235 * 236 * @throws UnsupportedOptionsException 237 * XZ headers seem valid but they specify 238 * options not supported by this implementation 239 * 240 * @throws EOFException 241 * less than 6 bytes of input was available 242 * from <code>in</code>, or (unlikely) the size 243 * of the underlying stream got smaller while 244 * this was reading from it 245 * 246 * @throws IOException may be thrown by <code>in</code> 247 * 248 * @since 1.7 249 */ SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache)250 public SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache) 251 throws IOException { 252 this(in, -1, arrayCache); 253 } 254 255 /** 256 * Creates a new seekable XZ decomporessor with an optional 257 * memory usage limit. 258 * 259 * @param in seekable input stream containing one or more 260 * XZ Streams; the whole input stream is used 261 * 262 * @param memoryLimit memory usage limit in kibibytes (KiB) 263 * or <code>-1</code> to impose no 264 * memory usage limit 265 * 266 * @throws XZFormatException 267 * input is not in the XZ format 268 * 269 * @throws CorruptedInputException 270 * XZ data is corrupt or truncated 271 * 272 * @throws UnsupportedOptionsException 273 * XZ headers seem valid but they specify 274 * options not supported by this implementation 275 * 276 * @throws MemoryLimitException 277 * decoded XZ Indexes would need more memory 278 * than allowed by the memory usage limit 279 * 280 * @throws EOFException 281 * less than 6 bytes of input was available 282 * from <code>in</code>, or (unlikely) the size 283 * of the underlying stream got smaller while 284 * this was reading from it 285 * 286 * @throws IOException may be thrown by <code>in</code> 287 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit)288 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit) 289 throws IOException { 290 this(in, memoryLimit, true); 291 } 292 293 /** 294 * Creates a new seekable XZ decomporessor with an optional 295 * memory usage limit. 296 * <p> 297 * This is identical to 298 * <code>SeekableXZInputStream(SeekableInputStream,int)</code> 299 * except that this also takes the <code>arrayCache</code> argument. 300 * 301 * @param in seekable input stream containing one or more 302 * XZ Streams; the whole input stream is used 303 * 304 * @param memoryLimit memory usage limit in kibibytes (KiB) 305 * or <code>-1</code> to impose no 306 * memory usage limit 307 * 308 * @param arrayCache cache to be used for allocating large arrays 309 * 310 * @throws XZFormatException 311 * input is not in the XZ format 312 * 313 * @throws CorruptedInputException 314 * XZ data is corrupt or truncated 315 * 316 * @throws UnsupportedOptionsException 317 * XZ headers seem valid but they specify 318 * options not supported by this implementation 319 * 320 * @throws MemoryLimitException 321 * decoded XZ Indexes would need more memory 322 * than allowed by the memory usage limit 323 * 324 * @throws EOFException 325 * less than 6 bytes of input was available 326 * from <code>in</code>, or (unlikely) the size 327 * of the underlying stream got smaller while 328 * this was reading from it 329 * 330 * @throws IOException may be thrown by <code>in</code> 331 * 332 * @since 1.7 333 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit, ArrayCache arrayCache)334 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit, 335 ArrayCache arrayCache) 336 throws IOException { 337 this(in, memoryLimit, true, arrayCache); 338 } 339 340 /** 341 * Creates a new seekable XZ decomporessor with an optional 342 * memory usage limit and ability to disable verification 343 * of integrity checks. 344 * <p> 345 * Note that integrity check verification should almost never be disabled. 346 * Possible reasons to disable integrity check verification: 347 * <ul> 348 * <li>Trying to recover data from a corrupt .xz file.</li> 349 * <li>Speeding up decompression. This matters mostly with SHA-256 350 * or with files that have compressed extremely well. It's recommended 351 * that integrity checking isn't disabled for performance reasons 352 * unless the file integrity is verified externally in some other 353 * way.</li> 354 * </ul> 355 * <p> 356 * <code>verifyCheck</code> only affects the integrity check of 357 * the actual compressed data. The CRC32 fields in the headers 358 * are always verified. 359 * 360 * @param in seekable input stream containing one or more 361 * XZ Streams; the whole input stream is used 362 * 363 * @param memoryLimit memory usage limit in kibibytes (KiB) 364 * or <code>-1</code> to impose no 365 * memory usage limit 366 * 367 * @param verifyCheck if <code>true</code>, the integrity checks 368 * will be verified; this should almost never 369 * be set to <code>false</code> 370 * 371 * @throws XZFormatException 372 * input is not in the XZ format 373 * 374 * @throws CorruptedInputException 375 * XZ data is corrupt or truncated 376 * 377 * @throws UnsupportedOptionsException 378 * XZ headers seem valid but they specify 379 * options not supported by this implementation 380 * 381 * @throws MemoryLimitException 382 * decoded XZ Indexes would need more memory 383 * than allowed by the memory usage limit 384 * 385 * @throws EOFException 386 * less than 6 bytes of input was available 387 * from <code>in</code>, or (unlikely) the size 388 * of the underlying stream got smaller while 389 * this was reading from it 390 * 391 * @throws IOException may be thrown by <code>in</code> 392 * 393 * @since 1.6 394 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck)395 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit, 396 boolean verifyCheck) 397 throws IOException { 398 this(in, memoryLimit, verifyCheck, ArrayCache.getDefaultCache()); 399 } 400 401 /** 402 * Creates a new seekable XZ decomporessor with an optional 403 * memory usage limit and ability to disable verification 404 * of integrity checks. 405 * <p> 406 * This is identical to 407 * <code>SeekableXZInputStream(SeekableInputStream,int,boolean)</code> 408 * except that this also takes the <code>arrayCache</code> argument. 409 * 410 * @param in seekable input stream containing one or more 411 * XZ Streams; the whole input stream is used 412 * 413 * @param memoryLimit memory usage limit in kibibytes (KiB) 414 * or <code>-1</code> to impose no 415 * memory usage limit 416 * 417 * @param verifyCheck if <code>true</code>, the integrity checks 418 * will be verified; this should almost never 419 * be set to <code>false</code> 420 * 421 * @param arrayCache cache to be used for allocating large arrays 422 * 423 * @throws XZFormatException 424 * input is not in the XZ format 425 * 426 * @throws CorruptedInputException 427 * XZ data is corrupt or truncated 428 * 429 * @throws UnsupportedOptionsException 430 * XZ headers seem valid but they specify 431 * options not supported by this implementation 432 * 433 * @throws MemoryLimitException 434 * decoded XZ Indexes would need more memory 435 * than allowed by the memory usage limit 436 * 437 * @throws EOFException 438 * less than 6 bytes of input was available 439 * from <code>in</code>, or (unlikely) the size 440 * of the underlying stream got smaller while 441 * this was reading from it 442 * 443 * @throws IOException may be thrown by <code>in</code> 444 * 445 * @since 1.7 446 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck, ArrayCache arrayCache)447 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit, 448 boolean verifyCheck, ArrayCache arrayCache) 449 throws IOException { 450 this.arrayCache = arrayCache; 451 this.verifyCheck = verifyCheck; 452 this.in = in; 453 DataInputStream inData = new DataInputStream(in); 454 455 // Check the magic bytes in the beginning of the file. 456 { 457 in.seek(0); 458 byte[] buf = new byte[XZ.HEADER_MAGIC.length]; 459 inData.readFully(buf); 460 if (!Arrays.equals(buf, XZ.HEADER_MAGIC)) 461 throw new XZFormatException(); 462 } 463 464 // Get the file size and verify that it is a multiple of 4 bytes. 465 long pos = in.length(); 466 if ((pos & 3) != 0) 467 throw new CorruptedInputException( 468 "XZ file size is not a multiple of 4 bytes"); 469 470 // Parse the headers starting from the end of the file. 471 byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE]; 472 long streamPadding = 0; 473 474 while (pos > 0) { 475 if (pos < DecoderUtil.STREAM_HEADER_SIZE) 476 throw new CorruptedInputException(); 477 478 // Read the potential Stream Footer. 479 in.seek(pos - DecoderUtil.STREAM_HEADER_SIZE); 480 inData.readFully(buf); 481 482 // Skip Stream Padding four bytes at a time. 483 // Skipping more at once would be faster, 484 // but usually there isn't much Stream Padding. 485 if (buf[8] == 0x00 && buf[9] == 0x00 && buf[10] == 0x00 486 && buf[11] == 0x00) { 487 streamPadding += 4; 488 pos -= 4; 489 continue; 490 } 491 492 // It's not Stream Padding. Update pos. 493 pos -= DecoderUtil.STREAM_HEADER_SIZE; 494 495 // Decode the Stream Footer and check if Backward Size 496 // looks reasonable. 497 StreamFlags streamFooter = DecoderUtil.decodeStreamFooter(buf); 498 if (streamFooter.backwardSize >= pos) 499 throw new CorruptedInputException( 500 "Backward Size in XZ Stream Footer is too big"); 501 502 // Check that the Check ID is supported. Store it in case this 503 // is the first Stream in the file. 504 check = Check.getInstance(streamFooter.checkType); 505 506 // Remember which Check IDs have been seen. 507 checkTypes |= 1 << streamFooter.checkType; 508 509 // Seek to the beginning of the Index. 510 in.seek(pos - streamFooter.backwardSize); 511 512 // Decode the Index field. 513 IndexDecoder index; 514 try { 515 index = new IndexDecoder(in, streamFooter, streamPadding, 516 memoryLimit); 517 } catch (MemoryLimitException e) { 518 // IndexDecoder doesn't know how much memory we had 519 // already needed so we need to recreate the exception. 520 assert memoryLimit >= 0; 521 throw new MemoryLimitException( 522 e.getMemoryNeeded() + indexMemoryUsage, 523 memoryLimit + indexMemoryUsage); 524 } 525 526 // Update the memory usage and limit counters. 527 indexMemoryUsage += index.getMemoryUsage(); 528 if (memoryLimit >= 0) { 529 memoryLimit -= index.getMemoryUsage(); 530 assert memoryLimit >= 0; 531 } 532 533 // Remember the uncompressed size of the largest Block. 534 if (largestBlockSize < index.getLargestBlockSize()) 535 largestBlockSize = index.getLargestBlockSize(); 536 537 // Calculate the offset to the beginning of this XZ Stream and 538 // check that it looks sane. 539 long off = index.getStreamSize() - DecoderUtil.STREAM_HEADER_SIZE; 540 if (pos < off) 541 throw new CorruptedInputException("XZ Index indicates " 542 + "too big compressed size for the XZ Stream"); 543 544 // Seek to the beginning of this Stream. 545 pos -= off; 546 in.seek(pos); 547 548 // Decode the Stream Header. 549 inData.readFully(buf); 550 StreamFlags streamHeader = DecoderUtil.decodeStreamHeader(buf); 551 552 // Verify that the Stream Header matches the Stream Footer. 553 if (!DecoderUtil.areStreamFlagsEqual(streamHeader, streamFooter)) 554 throw new CorruptedInputException( 555 "XZ Stream Footer does not match Stream Header"); 556 557 // Update the total uncompressed size of the file and check that 558 // it doesn't overflow. 559 uncompressedSize += index.getUncompressedSize(); 560 if (uncompressedSize < 0) 561 throw new UnsupportedOptionsException("XZ file is too big"); 562 563 // Update the Block count and check that it fits into an int. 564 blockCount += index.getRecordCount(); 565 if (blockCount < 0) 566 throw new UnsupportedOptionsException( 567 "XZ file has over " + Integer.MAX_VALUE + " Blocks"); 568 569 // Add this Stream to the list of Streams. 570 streams.add(index); 571 572 // Reset to be ready to parse the next Stream. 573 streamPadding = 0; 574 } 575 576 assert pos == 0; 577 578 // Save it now that indexMemoryUsage has been substracted from it. 579 this.memoryLimit = memoryLimit; 580 581 // Store the relative offsets of the Streams. This way we don't 582 // need to recalculate them in this class when seeking; the 583 // IndexDecoder instances will handle them. 584 IndexDecoder prev = streams.get(streams.size() - 1); 585 for (int i = streams.size() - 2; i >= 0; --i) { 586 IndexDecoder cur = streams.get(i); 587 cur.setOffsets(prev); 588 prev = cur; 589 } 590 591 // Initialize curBlockInfo to point to the first Stream. 592 // The blockNumber will be left to -1 so that .hasNext() 593 // and .setNext() work to get the first Block when starting 594 // to decompress from the beginning of the file. 595 IndexDecoder first = streams.get(streams.size() - 1); 596 curBlockInfo = new BlockInfo(first); 597 598 // queriedBlockInfo needs to be allocated too. The Stream used for 599 // initialization doesn't matter though. 600 queriedBlockInfo = new BlockInfo(first); 601 } 602 603 /** 604 * Gets the types of integrity checks used in the .xz file. 605 * Multiple checks are possible only if there are multiple 606 * concatenated XZ Streams. 607 * <p> 608 * The returned value has a bit set for every check type that is present. 609 * For example, if CRC64 and SHA-256 were used, the return value is 610 * <code>(1 << XZ.CHECK_CRC64) 611 * | (1 << XZ.CHECK_SHA256)</code>. 612 */ getCheckTypes()613 public int getCheckTypes() { 614 return checkTypes; 615 } 616 617 /** 618 * Gets the amount of memory in kibibytes (KiB) used by 619 * the data structures needed to locate the XZ Blocks. 620 * This is usually useless information but since it is calculated 621 * for memory usage limit anyway, it is nice to make it available to too. 622 */ getIndexMemoryUsage()623 public int getIndexMemoryUsage() { 624 return indexMemoryUsage; 625 } 626 627 /** 628 * Gets the uncompressed size of the largest XZ Block in bytes. 629 * This can be useful if you want to check that the file doesn't 630 * have huge XZ Blocks which could make seeking to arbitrary offsets 631 * very slow. Note that huge Blocks don't automatically mean that 632 * seeking would be slow, for example, seeking to the beginning of 633 * any Block is always fast. 634 */ getLargestBlockSize()635 public long getLargestBlockSize() { 636 return largestBlockSize; 637 } 638 639 /** 640 * Gets the number of Streams in the .xz file. 641 * 642 * @since 1.3 643 */ getStreamCount()644 public int getStreamCount() { 645 return streams.size(); 646 } 647 648 /** 649 * Gets the number of Blocks in the .xz file. 650 * 651 * @since 1.3 652 */ getBlockCount()653 public int getBlockCount() { 654 return blockCount; 655 } 656 657 /** 658 * Gets the uncompressed start position of the given Block. 659 * 660 * @throws IndexOutOfBoundsException if 661 * <code>blockNumber < 0</code> or 662 * <code>blockNumber >= getBlockCount()</code>. 663 * 664 * @since 1.3 665 */ getBlockPos(int blockNumber)666 public long getBlockPos(int blockNumber) { 667 locateBlockByNumber(queriedBlockInfo, blockNumber); 668 return queriedBlockInfo.uncompressedOffset; 669 } 670 671 /** 672 * Gets the uncompressed size of the given Block. 673 * 674 * @throws IndexOutOfBoundsException if 675 * <code>blockNumber < 0</code> or 676 * <code>blockNumber >= getBlockCount()</code>. 677 * 678 * @since 1.3 679 */ getBlockSize(int blockNumber)680 public long getBlockSize(int blockNumber) { 681 locateBlockByNumber(queriedBlockInfo, blockNumber); 682 return queriedBlockInfo.uncompressedSize; 683 } 684 685 /** 686 * Gets the position where the given compressed Block starts in 687 * the underlying .xz file. 688 * This information is rarely useful to the users of this class. 689 * 690 * @throws IndexOutOfBoundsException if 691 * <code>blockNumber < 0</code> or 692 * <code>blockNumber >= getBlockCount()</code>. 693 * 694 * @since 1.3 695 */ getBlockCompPos(int blockNumber)696 public long getBlockCompPos(int blockNumber) { 697 locateBlockByNumber(queriedBlockInfo, blockNumber); 698 return queriedBlockInfo.compressedOffset; 699 } 700 701 /** 702 * Gets the compressed size of the given Block. 703 * This together with the uncompressed size can be used to calculate 704 * the compression ratio of the specific Block. 705 * 706 * @throws IndexOutOfBoundsException if 707 * <code>blockNumber < 0</code> or 708 * <code>blockNumber >= getBlockCount()</code>. 709 * 710 * @since 1.3 711 */ getBlockCompSize(int blockNumber)712 public long getBlockCompSize(int blockNumber) { 713 locateBlockByNumber(queriedBlockInfo, blockNumber); 714 return (queriedBlockInfo.unpaddedSize + 3) & ~3; 715 } 716 717 /** 718 * Gets integrity check type (Check ID) of the given Block. 719 * 720 * @throws IndexOutOfBoundsException if 721 * <code>blockNumber < 0</code> or 722 * <code>blockNumber >= getBlockCount()</code>. 723 * 724 * @see #getCheckTypes() 725 * 726 * @since 1.3 727 */ getBlockCheckType(int blockNumber)728 public int getBlockCheckType(int blockNumber) { 729 locateBlockByNumber(queriedBlockInfo, blockNumber); 730 return queriedBlockInfo.getCheckType(); 731 } 732 733 /** 734 * Gets the number of the Block that contains the byte at the given 735 * uncompressed position. 736 * 737 * @throws IndexOutOfBoundsException if 738 * <code>pos < 0</code> or 739 * <code>pos >= length()</code>. 740 * 741 * @since 1.3 742 */ getBlockNumber(long pos)743 public int getBlockNumber(long pos) { 744 locateBlockByPos(queriedBlockInfo, pos); 745 return queriedBlockInfo.blockNumber; 746 } 747 748 /** 749 * Decompresses the next byte from this input stream. 750 * 751 * @return the next decompressed byte, or <code>-1</code> 752 * to indicate the end of the compressed stream 753 * 754 * @throws CorruptedInputException 755 * @throws UnsupportedOptionsException 756 * @throws MemoryLimitException 757 * 758 * @throws XZIOException if the stream has been closed 759 * 760 * @throws IOException may be thrown by <code>in</code> 761 */ read()762 public int read() throws IOException { 763 return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF); 764 } 765 766 /** 767 * Decompresses into an array of bytes. 768 * <p> 769 * If <code>len</code> is zero, no bytes are read and <code>0</code> 770 * is returned. Otherwise this will try to decompress <code>len</code> 771 * bytes of uncompressed data. Less than <code>len</code> bytes may 772 * be read only in the following situations: 773 * <ul> 774 * <li>The end of the compressed data was reached successfully.</li> 775 * <li>An error is detected after at least one but less than 776 * <code>len</code> bytes have already been successfully 777 * decompressed. The next call with non-zero <code>len</code> 778 * will immediately throw the pending exception.</li> 779 * <li>An exception is thrown.</li> 780 * </ul> 781 * 782 * @param buf target buffer for uncompressed data 783 * @param off start offset in <code>buf</code> 784 * @param len maximum number of uncompressed bytes to read 785 * 786 * @return number of bytes read, or <code>-1</code> to indicate 787 * the end of the compressed stream 788 * 789 * @throws CorruptedInputException 790 * @throws UnsupportedOptionsException 791 * @throws MemoryLimitException 792 * 793 * @throws XZIOException if the stream has been closed 794 * 795 * @throws IOException may be thrown by <code>in</code> 796 */ read(byte[] buf, int off, int len)797 public int read(byte[] buf, int off, int len) throws IOException { 798 if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) 799 throw new IndexOutOfBoundsException(); 800 801 if (len == 0) 802 return 0; 803 804 if (in == null) 805 throw new XZIOException("Stream closed"); 806 807 if (exception != null) 808 throw exception; 809 810 int size = 0; 811 812 try { 813 if (seekNeeded) 814 seek(); 815 816 if (endReached) 817 return -1; 818 819 while (len > 0) { 820 if (blockDecoder == null) { 821 seek(); 822 if (endReached) 823 break; 824 } 825 826 int ret = blockDecoder.read(buf, off, len); 827 828 if (ret > 0) { 829 curPos += ret; 830 size += ret; 831 off += ret; 832 len -= ret; 833 } else if (ret == -1) { 834 blockDecoder = null; 835 } 836 } 837 } catch (IOException e) { 838 // We know that the file isn't simply truncated because we could 839 // parse the Indexes in the constructor. So convert EOFException 840 // to CorruptedInputException. 841 if (e instanceof EOFException) 842 e = new CorruptedInputException(); 843 844 exception = e; 845 if (size == 0) 846 throw e; 847 } 848 849 return size; 850 } 851 852 /** 853 * Returns the number of uncompressed bytes that can be read 854 * without blocking. The value is returned with an assumption 855 * that the compressed input data will be valid. If the compressed 856 * data is corrupt, <code>CorruptedInputException</code> may get 857 * thrown before the number of bytes claimed to be available have 858 * been read from this input stream. 859 * 860 * @return the number of uncompressed bytes that can be read 861 * without blocking 862 */ available()863 public int available() throws IOException { 864 if (in == null) 865 throw new XZIOException("Stream closed"); 866 867 if (exception != null) 868 throw exception; 869 870 if (endReached || seekNeeded || blockDecoder == null) 871 return 0; 872 873 return blockDecoder.available(); 874 } 875 876 /** 877 * Closes the stream and calls <code>in.close()</code>. 878 * If the stream was already closed, this does nothing. 879 * <p> 880 * This is equivalent to <code>close(true)</code>. 881 * 882 * @throws IOException if thrown by <code>in.close()</code> 883 */ close()884 public void close() throws IOException { 885 close(true); 886 } 887 888 /** 889 * Closes the stream and optionally calls <code>in.close()</code>. 890 * If the stream was already closed, this does nothing. 891 * If <code>close(false)</code> has been called, a further 892 * call of <code>close(true)</code> does nothing (it doesn't call 893 * <code>in.close()</code>). 894 * <p> 895 * If you don't want to close the underlying <code>InputStream</code>, 896 * there is usually no need to worry about closing this stream either; 897 * it's fine to do nothing and let the garbage collector handle it. 898 * However, if you are using {@link ArrayCache}, <code>close(false)</code> 899 * can be useful to put the allocated arrays back to the cache without 900 * closing the underlying <code>InputStream</code>. 901 * <p> 902 * Note that if you successfully reach the end of the stream 903 * (<code>read</code> returns <code>-1</code>), the arrays are 904 * automatically put back to the cache by that <code>read</code> call. In 905 * this situation <code>close(false)</code> is redundant (but harmless). 906 * 907 * @throws IOException if thrown by <code>in.close()</code> 908 * 909 * @since 1.7 910 */ close(boolean closeInput)911 public void close(boolean closeInput) throws IOException { 912 if (in != null) { 913 if (blockDecoder != null) { 914 blockDecoder.close(); 915 blockDecoder = null; 916 } 917 918 try { 919 if (closeInput) 920 in.close(); 921 } finally { 922 in = null; 923 } 924 } 925 } 926 927 /** 928 * Gets the uncompressed size of this input stream. If there are multiple 929 * XZ Streams, the total uncompressed size of all XZ Streams is returned. 930 */ length()931 public long length() { 932 return uncompressedSize; 933 } 934 935 /** 936 * Gets the current uncompressed position in this input stream. 937 * 938 * @throws XZIOException if the stream has been closed 939 */ position()940 public long position() throws IOException { 941 if (in == null) 942 throw new XZIOException("Stream closed"); 943 944 return seekNeeded ? seekPos : curPos; 945 } 946 947 /** 948 * Seeks to the specified absolute uncompressed position in the stream. 949 * This only stores the new position, so this function itself is always 950 * very fast. The actual seek is done when <code>read</code> is called 951 * to read at least one byte. 952 * <p> 953 * Seeking past the end of the stream is possible. In that case 954 * <code>read</code> will return <code>-1</code> to indicate 955 * the end of the stream. 956 * 957 * @param pos new uncompressed read position 958 * 959 * @throws XZIOException 960 * if <code>pos</code> is negative, or 961 * if stream has been closed 962 */ seek(long pos)963 public void seek(long pos) throws IOException { 964 if (in == null) 965 throw new XZIOException("Stream closed"); 966 967 if (pos < 0) 968 throw new XZIOException("Negative seek position: " + pos); 969 970 seekPos = pos; 971 seekNeeded = true; 972 } 973 974 /** 975 * Seeks to the beginning of the given XZ Block. 976 * 977 * @throws XZIOException 978 * if <code>blockNumber < 0</code> or 979 * <code>blockNumber >= getBlockCount()</code>, 980 * or if stream has been closed 981 * 982 * @since 1.3 983 */ seekToBlock(int blockNumber)984 public void seekToBlock(int blockNumber) throws IOException { 985 if (in == null) 986 throw new XZIOException("Stream closed"); 987 988 if (blockNumber < 0 || blockNumber >= blockCount) 989 throw new XZIOException("Invalid XZ Block number: " + blockNumber); 990 991 // This is a bit silly implementation. Here we locate the uncompressed 992 // offset of the specified Block, then when doing the actual seek in 993 // seek(), we need to find the Block number based on seekPos. 994 seekPos = getBlockPos(blockNumber); 995 seekNeeded = true; 996 } 997 998 /** 999 * Does the actual seeking. This is also called when <code>read</code> 1000 * needs a new Block to decode. 1001 */ seek()1002 private void seek() throws IOException { 1003 // If seek(long) wasn't called, we simply need to get the next Block 1004 // from the same Stream. If there are no more Blocks in this Stream, 1005 // then we behave as if seek(long) had been called. 1006 if (!seekNeeded) { 1007 if (curBlockInfo.hasNext()) { 1008 curBlockInfo.setNext(); 1009 initBlockDecoder(); 1010 return; 1011 } 1012 1013 seekPos = curPos; 1014 } 1015 1016 seekNeeded = false; 1017 1018 // Check if we are seeking to or past the end of the file. 1019 if (seekPos >= uncompressedSize) { 1020 curPos = seekPos; 1021 1022 if (blockDecoder != null) { 1023 blockDecoder.close(); 1024 blockDecoder = null; 1025 } 1026 1027 endReached = true; 1028 return; 1029 } 1030 1031 endReached = false; 1032 1033 // Locate the Block that contains the uncompressed target position. 1034 locateBlockByPos(curBlockInfo, seekPos); 1035 1036 // Seek in the underlying stream and create a new Block decoder 1037 // only if really needed. We can skip it if the current position 1038 // is already in the correct Block and the target position hasn't 1039 // been decompressed yet. 1040 // 1041 // NOTE: If curPos points to the beginning of this Block, it's 1042 // because it was left there after decompressing an earlier Block. 1043 // In that case, decoding of the current Block hasn't been started 1044 // yet. (Decoding of a Block won't be started until at least one 1045 // byte will also be read from it.) 1046 if (!(curPos > curBlockInfo.uncompressedOffset && curPos <= seekPos)) { 1047 // Seek to the beginning of the Block. 1048 in.seek(curBlockInfo.compressedOffset); 1049 1050 // Since it is possible that this Block is from a different 1051 // Stream than the previous Block, initialize a new Check. 1052 check = Check.getInstance(curBlockInfo.getCheckType()); 1053 1054 // Create a new Block decoder. 1055 initBlockDecoder(); 1056 curPos = curBlockInfo.uncompressedOffset; 1057 } 1058 1059 // If the target wasn't at a Block boundary, decompress and throw 1060 // away data to reach the target position. 1061 if (seekPos > curPos) { 1062 // NOTE: The "if" below is there just in case. In this situation, 1063 // blockDecoder.skip will always skip the requested amount 1064 // or throw an exception. 1065 long skipAmount = seekPos - curPos; 1066 if (blockDecoder.skip(skipAmount) != skipAmount) 1067 throw new CorruptedInputException(); 1068 1069 curPos = seekPos; 1070 } 1071 } 1072 1073 /** 1074 * Locates the Block that contains the given uncompressed position. 1075 */ locateBlockByPos(BlockInfo info, long pos)1076 private void locateBlockByPos(BlockInfo info, long pos) { 1077 if (pos < 0 || pos >= uncompressedSize) 1078 throw new IndexOutOfBoundsException( 1079 "Invalid uncompressed position: " + pos); 1080 1081 // Locate the Stream that contains the target position. 1082 IndexDecoder index; 1083 for (int i = 0; ; ++i) { 1084 index = streams.get(i); 1085 if (index.hasUncompressedOffset(pos)) 1086 break; 1087 } 1088 1089 // Locate the Block from the Stream that contains the target position. 1090 index.locateBlock(info, pos); 1091 1092 assert (info.compressedOffset & 3) == 0; 1093 assert info.uncompressedSize > 0; 1094 assert pos >= info.uncompressedOffset; 1095 assert pos < info.uncompressedOffset + info.uncompressedSize; 1096 } 1097 1098 /** 1099 * Locates the given Block and stores information about it 1100 * to <code>info</code>. 1101 */ 1102 private void locateBlockByNumber(BlockInfo info, int blockNumber) { 1103 // Validate. 1104 if (blockNumber < 0 || blockNumber >= blockCount) 1105 throw new IndexOutOfBoundsException( 1106 "Invalid XZ Block number: " + blockNumber); 1107 1108 // Skip the search if info already points to the correct Block. 1109 if (info.blockNumber == blockNumber) 1110 return; 1111 1112 // Search the Stream that contains the given Block and then 1113 // search the Block from that Stream. 1114 for (int i = 0; ; ++i) { 1115 IndexDecoder index = streams.get(i); 1116 if (index.hasRecord(blockNumber)) { 1117 index.setBlockInfo(info, blockNumber); 1118 return; 1119 } 1120 } 1121 } 1122 1123 /** 1124 * Initializes a new BlockInputStream. This is a helper function for 1125 * <code>seek()</code>. 1126 */ 1127 private void initBlockDecoder() throws IOException { 1128 try { 1129 // Set it to null first so that GC can collect it if memory 1130 // runs tight when initializing a new BlockInputStream. 1131 if (blockDecoder != null) { 1132 blockDecoder.close(); 1133 blockDecoder = null; 1134 } 1135 1136 blockDecoder = new BlockInputStream( 1137 in, check, verifyCheck, memoryLimit, 1138 curBlockInfo.unpaddedSize, curBlockInfo.uncompressedSize, 1139 arrayCache); 1140 } catch (MemoryLimitException e) { 1141 // BlockInputStream doesn't know how much memory we had 1142 // already needed so we need to recreate the exception. 1143 assert memoryLimit >= 0; 1144 throw new MemoryLimitException( 1145 e.getMemoryNeeded() + indexMemoryUsage, 1146 memoryLimit + indexMemoryUsage); 1147 } catch (IndexIndicatorException e) { 1148 // It cannot be Index so the file must be corrupt. 1149 throw new CorruptedInputException(); 1150 } 1151 } 1152 } 1153