1 /* 2 * SeekableXZInputStream 3 * 4 * Author: Lasse Collin <lasse.collin@tukaani.org> 5 * 6 * This file has been put into the public domain. 7 * You can do whatever you want with this file. 8 */ 9 10 package org.tukaani.xz; 11 12 import java.util.Arrays; 13 import java.util.ArrayList; 14 import java.io.DataInputStream; 15 import java.io.IOException; 16 import java.io.EOFException; 17 import org.tukaani.xz.common.DecoderUtil; 18 import org.tukaani.xz.common.StreamFlags; 19 import org.tukaani.xz.check.Check; 20 import org.tukaani.xz.index.IndexDecoder; 21 import org.tukaani.xz.index.BlockInfo; 22 23 /** 24 * Decompresses a .xz file in random access mode. 25 * This supports decompressing concatenated .xz files. 26 * <p> 27 * Each .xz file consists of one or more Streams. Each Stream consists of zero 28 * or more Blocks. Each Stream contains an Index of Streams' Blocks. 29 * The Indexes from all Streams are loaded in RAM by a constructor of this 30 * class. A typical .xz file has only one Stream, and parsing its Index will 31 * need only three or four seeks. 32 * <p> 33 * To make random access possible, the data in a .xz file must be split 34 * into multiple Blocks of reasonable size. Decompression can only start at 35 * a Block boundary. When seeking to an uncompressed position that is not at 36 * a Block boundary, decompression starts at the beginning of the Block and 37 * throws away data until the target position is reached. Thus, smaller Blocks 38 * mean faster seeks to arbitrary uncompressed positions. On the other hand, 39 * smaller Blocks mean worse compression. So one has to make a compromise 40 * between random access speed and compression ratio. 41 * <p> 42 * Implementation note: This class uses linear search to locate the correct 43 * Stream from the data structures in RAM. It was the simplest to implement 44 * and should be fine as long as there aren't too many Streams. 
The correct 45 * Block inside a Stream is located using binary search and thus is fast 46 * even with a huge number of Blocks. 47 * 48 * <h2>Memory usage</h2> 49 * <p> 50 * The amount of memory needed for the Indexes is taken into account when 51 * checking the memory usage limit. Each Stream is calculated to need at 52 * least 1 KiB of memory and each Block 16 bytes of memory, rounded up 53 * to the next kibibyte. So unless the file has a huge number of Streams or 54 * Blocks, these don't take a significant amount of memory. 55 * 56 * <h2>Creating random-accessible .xz files</h2> 57 * <p> 58 * When using {@link XZOutputStream}, a new Block can be started by calling 59 * its {@link XZOutputStream#endBlock() endBlock} method. If you know 60 * that the decompressor will only need to seek to certain uncompressed 61 * positions, it can be a good idea to start a new Block at (some of) these 62 * positions (and only at these positions to get better compression ratio). 63 * <p> 64 * liblzma in XZ Utils supports starting a new Block with 65 * <code>LZMA_FULL_FLUSH</code>. XZ Utils 5.1.1alpha added threaded 66 * compression which creates multi-Block .xz files. XZ Utils 5.1.1alpha 67 * also added the option <code>--block-size=SIZE</code> to the xz command 68 * line tool. XZ Utils 5.1.2alpha added a partial implementation of 69 * <code>--block-list=SIZES</code> which allows specifying sizes of 70 * individual Blocks. 
71 * 72 * <h2>Example: getting the uncompressed size of a .xz file</h2> 73 * <blockquote><pre> 74 * String filename = "foo.xz"; 75 * SeekableFileInputStream seekableFile 76 * = new SeekableFileInputStream(filename); 77 * 78 * try { 79 * SeekableXZInputStream seekableXZ 80 * = new SeekableXZInputStream(seekableFile); 81 * System.out.println("Uncompressed size: " + seekableXZ.length()); 82 * } finally { 83 * seekableFile.close(); 84 * } 85 * </pre></blockquote> 86 * 87 * @see SeekableFileInputStream 88 * @see XZInputStream 89 * @see XZOutputStream 90 */ 91 public class SeekableXZInputStream extends SeekableInputStream { 92 /** 93 * Cache for big arrays. 94 */ 95 private final ArrayCache arrayCache; 96 97 /** 98 * The input stream containing XZ compressed data. 99 */ 100 private SeekableInputStream in; 101 102 /** 103 * Memory usage limit after the memory usage of the IndexDecoders have 104 * been substracted. 105 */ 106 private final int memoryLimit; 107 108 /** 109 * Memory usage of the IndexDecoders. 110 * <code>memoryLimit + indexMemoryUsage</code> equals the original 111 * memory usage limit that was passed to the constructor. 112 */ 113 private int indexMemoryUsage = 0; 114 115 /** 116 * List of IndexDecoders, one for each Stream in the file. 117 * The list is in reverse order: The first element is 118 * the last Stream in the file. 119 */ 120 private final ArrayList<IndexDecoder> streams 121 = new ArrayList<IndexDecoder>(); 122 123 /** 124 * Bitmask of all Check IDs seen. 125 */ 126 private int checkTypes = 0; 127 128 /** 129 * Uncompressed size of the file (all Streams). 130 */ 131 private long uncompressedSize = 0; 132 133 /** 134 * Uncompressed size of the largest XZ Block in the file. 135 */ 136 private long largestBlockSize = 0; 137 138 /** 139 * Number of XZ Blocks in the file. 140 */ 141 private int blockCount = 0; 142 143 /** 144 * Size and position information about the current Block. 145 * If there are no Blocks, all values will be <code>-1</code>. 
146 */ 147 private final BlockInfo curBlockInfo; 148 149 /** 150 * Temporary (and cached) information about the Block whose information 151 * is queried via <code>getBlockPos</code> and related functions. 152 */ 153 private final BlockInfo queriedBlockInfo; 154 155 /** 156 * Integrity Check in the current XZ Stream. The constructor leaves 157 * this to point to the Check of the first Stream. 158 */ 159 private Check check; 160 161 /** 162 * Flag indicating if the integrity checks will be verified. 163 */ 164 private final boolean verifyCheck; 165 166 /** 167 * Decoder of the current XZ Block, if any. 168 */ 169 private BlockInputStream blockDecoder = null; 170 171 /** 172 * Current uncompressed position. 173 */ 174 private long curPos = 0; 175 176 /** 177 * Target position for seeking. 178 */ 179 private long seekPos; 180 181 /** 182 * True when <code>seek(long)</code> has been called but the actual 183 * seeking hasn't been done yet. 184 */ 185 private boolean seekNeeded = false; 186 187 /** 188 * True when end of the file was reached. This can be cleared by 189 * calling <code>seek(long)</code>. 190 */ 191 private boolean endReached = false; 192 193 /** 194 * Pending exception from an earlier error. 195 */ 196 private IOException exception = null; 197 198 /** 199 * Temporary buffer for read(). This avoids reallocating memory 200 * on every read() call. 201 */ 202 private final byte[] tempBuf = new byte[1]; 203 204 /** 205 * Creates a new seekable XZ decompressor without a memory usage limit. 
206 * 207 * @param in seekable input stream containing one or more 208 * XZ Streams; the whole input stream is used 209 * 210 * @throws XZFormatException 211 * input is not in the XZ format 212 * 213 * @throws CorruptedInputException 214 * XZ data is corrupt or truncated 215 * 216 * @throws UnsupportedOptionsException 217 * XZ headers seem valid but they specify 218 * options not supported by this implementation 219 * 220 * @throws EOFException 221 * less than 6 bytes of input was available 222 * from <code>in</code>, or (unlikely) the size 223 * of the underlying stream got smaller while 224 * this was reading from it 225 * 226 * @throws IOException may be thrown by <code>in</code> 227 */ SeekableXZInputStream(SeekableInputStream in)228 public SeekableXZInputStream(SeekableInputStream in) 229 throws IOException { 230 this(in, -1); 231 } 232 233 /** 234 * Creates a new seekable XZ decompressor without a memory usage limit. 235 * <p> 236 * This is identical to 237 * <code>SeekableXZInputStream(SeekableInputStream)</code> except that 238 * this also takes the <code>arrayCache</code> argument. 
239 * 240 * @param in seekable input stream containing one or more 241 * XZ Streams; the whole input stream is used 242 * 243 * @param arrayCache cache to be used for allocating large arrays 244 * 245 * @throws XZFormatException 246 * input is not in the XZ format 247 * 248 * @throws CorruptedInputException 249 * XZ data is corrupt or truncated 250 * 251 * @throws UnsupportedOptionsException 252 * XZ headers seem valid but they specify 253 * options not supported by this implementation 254 * 255 * @throws EOFException 256 * less than 6 bytes of input was available 257 * from <code>in</code>, or (unlikely) the size 258 * of the underlying stream got smaller while 259 * this was reading from it 260 * 261 * @throws IOException may be thrown by <code>in</code> 262 * 263 * @since 1.7 264 */ SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache)265 public SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache) 266 throws IOException { 267 this(in, -1, arrayCache); 268 } 269 270 /** 271 * Creates a new seekable XZ decomporessor with an optional 272 * memory usage limit. 
273 * 274 * @param in seekable input stream containing one or more 275 * XZ Streams; the whole input stream is used 276 * 277 * @param memoryLimit memory usage limit in kibibytes (KiB) 278 * or <code>-1</code> to impose no 279 * memory usage limit 280 * 281 * @throws XZFormatException 282 * input is not in the XZ format 283 * 284 * @throws CorruptedInputException 285 * XZ data is corrupt or truncated 286 * 287 * @throws UnsupportedOptionsException 288 * XZ headers seem valid but they specify 289 * options not supported by this implementation 290 * 291 * @throws MemoryLimitException 292 * decoded XZ Indexes would need more memory 293 * than allowed by the memory usage limit 294 * 295 * @throws EOFException 296 * less than 6 bytes of input was available 297 * from <code>in</code>, or (unlikely) the size 298 * of the underlying stream got smaller while 299 * this was reading from it 300 * 301 * @throws IOException may be thrown by <code>in</code> 302 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit)303 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit) 304 throws IOException { 305 this(in, memoryLimit, true); 306 } 307 308 /** 309 * Creates a new seekable XZ decomporessor with an optional 310 * memory usage limit. 311 * <p> 312 * This is identical to 313 * <code>SeekableXZInputStream(SeekableInputStream,int)</code> 314 * except that this also takes the <code>arrayCache</code> argument. 
315 * 316 * @param in seekable input stream containing one or more 317 * XZ Streams; the whole input stream is used 318 * 319 * @param memoryLimit memory usage limit in kibibytes (KiB) 320 * or <code>-1</code> to impose no 321 * memory usage limit 322 * 323 * @param arrayCache cache to be used for allocating large arrays 324 * 325 * @throws XZFormatException 326 * input is not in the XZ format 327 * 328 * @throws CorruptedInputException 329 * XZ data is corrupt or truncated 330 * 331 * @throws UnsupportedOptionsException 332 * XZ headers seem valid but they specify 333 * options not supported by this implementation 334 * 335 * @throws MemoryLimitException 336 * decoded XZ Indexes would need more memory 337 * than allowed by the memory usage limit 338 * 339 * @throws EOFException 340 * less than 6 bytes of input was available 341 * from <code>in</code>, or (unlikely) the size 342 * of the underlying stream got smaller while 343 * this was reading from it 344 * 345 * @throws IOException may be thrown by <code>in</code> 346 * 347 * @since 1.7 348 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit, ArrayCache arrayCache)349 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit, 350 ArrayCache arrayCache) 351 throws IOException { 352 this(in, memoryLimit, true, arrayCache); 353 } 354 355 /** 356 * Creates a new seekable XZ decomporessor with an optional 357 * memory usage limit and ability to disable verification 358 * of integrity checks. 359 * <p> 360 * Note that integrity check verification should almost never be disabled. 361 * Possible reasons to disable integrity check verification: 362 * <ul> 363 * <li>Trying to recover data from a corrupt .xz file.</li> 364 * <li>Speeding up decompression. This matters mostly with SHA-256 365 * or with files that have compressed extremely well. 
It's recommended 366 * that integrity checking isn't disabled for performance reasons 367 * unless the file integrity is verified externally in some other 368 * way.</li> 369 * </ul> 370 * <p> 371 * <code>verifyCheck</code> only affects the integrity check of 372 * the actual compressed data. The CRC32 fields in the headers 373 * are always verified. 374 * 375 * @param in seekable input stream containing one or more 376 * XZ Streams; the whole input stream is used 377 * 378 * @param memoryLimit memory usage limit in kibibytes (KiB) 379 * or <code>-1</code> to impose no 380 * memory usage limit 381 * 382 * @param verifyCheck if <code>true</code>, the integrity checks 383 * will be verified; this should almost never 384 * be set to <code>false</code> 385 * 386 * @throws XZFormatException 387 * input is not in the XZ format 388 * 389 * @throws CorruptedInputException 390 * XZ data is corrupt or truncated 391 * 392 * @throws UnsupportedOptionsException 393 * XZ headers seem valid but they specify 394 * options not supported by this implementation 395 * 396 * @throws MemoryLimitException 397 * decoded XZ Indexes would need more memory 398 * than allowed by the memory usage limit 399 * 400 * @throws EOFException 401 * less than 6 bytes of input was available 402 * from <code>in</code>, or (unlikely) the size 403 * of the underlying stream got smaller while 404 * this was reading from it 405 * 406 * @throws IOException may be thrown by <code>in</code> 407 * 408 * @since 1.6 409 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck)410 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit, 411 boolean verifyCheck) 412 throws IOException { 413 this(in, memoryLimit, verifyCheck, ArrayCache.getDefaultCache()); 414 } 415 416 /** 417 * Creates a new seekable XZ decomporessor with an optional 418 * memory usage limit and ability to disable verification 419 * of integrity checks. 
420 * <p> 421 * This is identical to 422 * <code>SeekableXZInputStream(SeekableInputStream,int,boolean)</code> 423 * except that this also takes the <code>arrayCache</code> argument. 424 * 425 * @param in seekable input stream containing one or more 426 * XZ Streams; the whole input stream is used 427 * 428 * @param memoryLimit memory usage limit in kibibytes (KiB) 429 * or <code>-1</code> to impose no 430 * memory usage limit 431 * 432 * @param verifyCheck if <code>true</code>, the integrity checks 433 * will be verified; this should almost never 434 * be set to <code>false</code> 435 * 436 * @param arrayCache cache to be used for allocating large arrays 437 * 438 * @throws XZFormatException 439 * input is not in the XZ format 440 * 441 * @throws CorruptedInputException 442 * XZ data is corrupt or truncated 443 * 444 * @throws UnsupportedOptionsException 445 * XZ headers seem valid but they specify 446 * options not supported by this implementation 447 * 448 * @throws MemoryLimitException 449 * decoded XZ Indexes would need more memory 450 * than allowed by the memory usage limit 451 * 452 * @throws EOFException 453 * less than 6 bytes of input was available 454 * from <code>in</code>, or (unlikely) the size 455 * of the underlying stream got smaller while 456 * this was reading from it 457 * 458 * @throws IOException may be thrown by <code>in</code> 459 * 460 * @since 1.7 461 */ SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck, ArrayCache arrayCache)462 public SeekableXZInputStream(SeekableInputStream in, int memoryLimit, 463 boolean verifyCheck, ArrayCache arrayCache) 464 throws IOException { 465 this.arrayCache = arrayCache; 466 this.verifyCheck = verifyCheck; 467 this.in = in; 468 DataInputStream inData = new DataInputStream(in); 469 470 // Check the magic bytes in the beginning of the file. 
471 { 472 in.seek(0); 473 byte[] buf = new byte[XZ.HEADER_MAGIC.length]; 474 inData.readFully(buf); 475 if (!Arrays.equals(buf, XZ.HEADER_MAGIC)) 476 throw new XZFormatException(); 477 } 478 479 // Get the file size and verify that it is a multiple of 4 bytes. 480 long pos = in.length(); 481 if ((pos & 3) != 0) 482 throw new CorruptedInputException( 483 "XZ file size is not a multiple of 4 bytes"); 484 485 // Parse the headers starting from the end of the file. 486 byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE]; 487 long streamPadding = 0; 488 489 while (pos > 0) { 490 if (pos < DecoderUtil.STREAM_HEADER_SIZE) 491 throw new CorruptedInputException(); 492 493 // Read the potential Stream Footer. 494 in.seek(pos - DecoderUtil.STREAM_HEADER_SIZE); 495 inData.readFully(buf); 496 497 // Skip Stream Padding four bytes at a time. 498 // Skipping more at once would be faster, 499 // but usually there isn't much Stream Padding. 500 if (buf[8] == 0x00 && buf[9] == 0x00 && buf[10] == 0x00 501 && buf[11] == 0x00) { 502 streamPadding += 4; 503 pos -= 4; 504 continue; 505 } 506 507 // It's not Stream Padding. Update pos. 508 pos -= DecoderUtil.STREAM_HEADER_SIZE; 509 510 // Decode the Stream Footer and check if Backward Size 511 // looks reasonable. 512 StreamFlags streamFooter = DecoderUtil.decodeStreamFooter(buf); 513 if (streamFooter.backwardSize >= pos) 514 throw new CorruptedInputException( 515 "Backward Size in XZ Stream Footer is too big"); 516 517 // Check that the Check ID is supported. Store it in case this 518 // is the first Stream in the file. 519 check = Check.getInstance(streamFooter.checkType); 520 521 // Remember which Check IDs have been seen. 522 checkTypes |= 1 << streamFooter.checkType; 523 524 // Seek to the beginning of the Index. 525 in.seek(pos - streamFooter.backwardSize); 526 527 // Decode the Index field. 
528 IndexDecoder index; 529 try { 530 index = new IndexDecoder(in, streamFooter, streamPadding, 531 memoryLimit); 532 } catch (MemoryLimitException e) { 533 // IndexDecoder doesn't know how much memory we had 534 // already needed so we need to recreate the exception. 535 assert memoryLimit >= 0; 536 throw new MemoryLimitException( 537 e.getMemoryNeeded() + indexMemoryUsage, 538 memoryLimit + indexMemoryUsage); 539 } 540 541 // Update the memory usage and limit counters. 542 indexMemoryUsage += index.getMemoryUsage(); 543 if (memoryLimit >= 0) { 544 memoryLimit -= index.getMemoryUsage(); 545 assert memoryLimit >= 0; 546 } 547 548 // Remember the uncompressed size of the largest Block. 549 if (largestBlockSize < index.getLargestBlockSize()) 550 largestBlockSize = index.getLargestBlockSize(); 551 552 // Calculate the offset to the beginning of this XZ Stream and 553 // check that it looks sane. 554 long off = index.getStreamSize() - DecoderUtil.STREAM_HEADER_SIZE; 555 if (pos < off) 556 throw new CorruptedInputException("XZ Index indicates " 557 + "too big compressed size for the XZ Stream"); 558 559 // Seek to the beginning of this Stream. 560 pos -= off; 561 in.seek(pos); 562 563 // Decode the Stream Header. 564 inData.readFully(buf); 565 StreamFlags streamHeader = DecoderUtil.decodeStreamHeader(buf); 566 567 // Verify that the Stream Header matches the Stream Footer. 568 if (!DecoderUtil.areStreamFlagsEqual(streamHeader, streamFooter)) 569 throw new CorruptedInputException( 570 "XZ Stream Footer does not match Stream Header"); 571 572 // Update the total uncompressed size of the file and check that 573 // it doesn't overflow. 574 uncompressedSize += index.getUncompressedSize(); 575 if (uncompressedSize < 0) 576 throw new UnsupportedOptionsException("XZ file is too big"); 577 578 // Update the Block count and check that it fits into an int. 
579 blockCount += index.getRecordCount(); 580 if (blockCount < 0) 581 throw new UnsupportedOptionsException( 582 "XZ file has over " + Integer.MAX_VALUE + " Blocks"); 583 584 // Add this Stream to the list of Streams. 585 streams.add(index); 586 587 // Reset to be ready to parse the next Stream. 588 streamPadding = 0; 589 } 590 591 assert pos == 0; 592 593 // Save it now that indexMemoryUsage has been substracted from it. 594 this.memoryLimit = memoryLimit; 595 596 // Store the relative offsets of the Streams. This way we don't 597 // need to recalculate them in this class when seeking; the 598 // IndexDecoder instances will handle them. 599 IndexDecoder prev = streams.get(streams.size() - 1); 600 for (int i = streams.size() - 2; i >= 0; --i) { 601 IndexDecoder cur = streams.get(i); 602 cur.setOffsets(prev); 603 prev = cur; 604 } 605 606 // Initialize curBlockInfo to point to the first Stream. 607 // The blockNumber will be left to -1 so that .hasNext() 608 // and .setNext() work to get the first Block when starting 609 // to decompress from the beginning of the file. 610 IndexDecoder first = streams.get(streams.size() - 1); 611 curBlockInfo = new BlockInfo(first); 612 613 // queriedBlockInfo needs to be allocated too. The Stream used for 614 // initialization doesn't matter though. 615 queriedBlockInfo = new BlockInfo(first); 616 } 617 618 /** 619 * Gets the types of integrity checks used in the .xz file. 620 * Multiple checks are possible only if there are multiple 621 * concatenated XZ Streams. 622 * <p> 623 * The returned value has a bit set for every check type that is present. 624 * For example, if CRC64 and SHA-256 were used, the return value is 625 * <code>(1 << XZ.CHECK_CRC64) 626 * | (1 << XZ.CHECK_SHA256)</code>. 627 */ getCheckTypes()628 public int getCheckTypes() { 629 return checkTypes; 630 } 631 632 /** 633 * Gets the amount of memory in kibibytes (KiB) used by 634 * the data structures needed to locate the XZ Blocks. 
635 * This is usually useless information but since it is calculated 636 * for memory usage limit anyway, it is nice to make it available to too. 637 */ getIndexMemoryUsage()638 public int getIndexMemoryUsage() { 639 return indexMemoryUsage; 640 } 641 642 /** 643 * Gets the uncompressed size of the largest XZ Block in bytes. 644 * This can be useful if you want to check that the file doesn't 645 * have huge XZ Blocks which could make seeking to arbitrary offsets 646 * very slow. Note that huge Blocks don't automatically mean that 647 * seeking would be slow, for example, seeking to the beginning of 648 * any Block is always fast. 649 */ getLargestBlockSize()650 public long getLargestBlockSize() { 651 return largestBlockSize; 652 } 653 654 /** 655 * Gets the number of Streams in the .xz file. 656 * 657 * @since 1.3 658 */ getStreamCount()659 public int getStreamCount() { 660 return streams.size(); 661 } 662 663 /** 664 * Gets the number of Blocks in the .xz file. 665 * 666 * @since 1.3 667 */ getBlockCount()668 public int getBlockCount() { 669 return blockCount; 670 } 671 672 /** 673 * Gets the uncompressed start position of the given Block. 674 * 675 * @throws IndexOutOfBoundsException if 676 * <code>blockNumber < 0</code> or 677 * <code>blockNumber >= getBlockCount()</code>. 678 * 679 * @since 1.3 680 */ getBlockPos(int blockNumber)681 public long getBlockPos(int blockNumber) { 682 locateBlockByNumber(queriedBlockInfo, blockNumber); 683 return queriedBlockInfo.uncompressedOffset; 684 } 685 686 /** 687 * Gets the uncompressed size of the given Block. 688 * 689 * @throws IndexOutOfBoundsException if 690 * <code>blockNumber < 0</code> or 691 * <code>blockNumber >= getBlockCount()</code>. 
692 * 693 * @since 1.3 694 */ getBlockSize(int blockNumber)695 public long getBlockSize(int blockNumber) { 696 locateBlockByNumber(queriedBlockInfo, blockNumber); 697 return queriedBlockInfo.uncompressedSize; 698 } 699 700 /** 701 * Gets the position where the given compressed Block starts in 702 * the underlying .xz file. 703 * This information is rarely useful to the users of this class. 704 * 705 * @throws IndexOutOfBoundsException if 706 * <code>blockNumber < 0</code> or 707 * <code>blockNumber >= getBlockCount()</code>. 708 * 709 * @since 1.3 710 */ getBlockCompPos(int blockNumber)711 public long getBlockCompPos(int blockNumber) { 712 locateBlockByNumber(queriedBlockInfo, blockNumber); 713 return queriedBlockInfo.compressedOffset; 714 } 715 716 /** 717 * Gets the compressed size of the given Block. 718 * This together with the uncompressed size can be used to calculate 719 * the compression ratio of the specific Block. 720 * 721 * @throws IndexOutOfBoundsException if 722 * <code>blockNumber < 0</code> or 723 * <code>blockNumber >= getBlockCount()</code>. 724 * 725 * @since 1.3 726 */ getBlockCompSize(int blockNumber)727 public long getBlockCompSize(int blockNumber) { 728 locateBlockByNumber(queriedBlockInfo, blockNumber); 729 return (queriedBlockInfo.unpaddedSize + 3) & ~3; 730 } 731 732 /** 733 * Gets integrity check type (Check ID) of the given Block. 734 * 735 * @throws IndexOutOfBoundsException if 736 * <code>blockNumber < 0</code> or 737 * <code>blockNumber >= getBlockCount()</code>. 738 * 739 * @see #getCheckTypes() 740 * 741 * @since 1.3 742 */ getBlockCheckType(int blockNumber)743 public int getBlockCheckType(int blockNumber) { 744 locateBlockByNumber(queriedBlockInfo, blockNumber); 745 return queriedBlockInfo.getCheckType(); 746 } 747 748 /** 749 * Gets the number of the Block that contains the byte at the given 750 * uncompressed position. 
751 * 752 * @throws IndexOutOfBoundsException if 753 * <code>pos < 0</code> or 754 * <code>pos >= length()</code>. 755 * 756 * @since 1.3 757 */ getBlockNumber(long pos)758 public int getBlockNumber(long pos) { 759 locateBlockByPos(queriedBlockInfo, pos); 760 return queriedBlockInfo.blockNumber; 761 } 762 763 /** 764 * Decompresses the next byte from this input stream. 765 * 766 * @return the next decompressed byte, or <code>-1</code> 767 * to indicate the end of the compressed stream 768 * 769 * @throws CorruptedInputException 770 * @throws UnsupportedOptionsException 771 * @throws MemoryLimitException 772 * 773 * @throws XZIOException if the stream has been closed 774 * 775 * @throws IOException may be thrown by <code>in</code> 776 */ read()777 public int read() throws IOException { 778 return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF); 779 } 780 781 /** 782 * Decompresses into an array of bytes. 783 * <p> 784 * If <code>len</code> is zero, no bytes are read and <code>0</code> 785 * is returned. Otherwise this will try to decompress <code>len</code> 786 * bytes of uncompressed data. Less than <code>len</code> bytes may 787 * be read only in the following situations: 788 * <ul> 789 * <li>The end of the compressed data was reached successfully.</li> 790 * <li>An error is detected after at least one but less than 791 * <code>len</code> bytes have already been successfully 792 * decompressed. 
The next call with non-zero <code>len</code> 793 * will immediately throw the pending exception.</li> 794 * <li>An exception is thrown.</li> 795 * </ul> 796 * 797 * @param buf target buffer for uncompressed data 798 * @param off start offset in <code>buf</code> 799 * @param len maximum number of uncompressed bytes to read 800 * 801 * @return number of bytes read, or <code>-1</code> to indicate 802 * the end of the compressed stream 803 * 804 * @throws CorruptedInputException 805 * @throws UnsupportedOptionsException 806 * @throws MemoryLimitException 807 * 808 * @throws XZIOException if the stream has been closed 809 * 810 * @throws IOException may be thrown by <code>in</code> 811 */ read(byte[] buf, int off, int len)812 public int read(byte[] buf, int off, int len) throws IOException { 813 if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length) 814 throw new IndexOutOfBoundsException(); 815 816 if (len == 0) 817 return 0; 818 819 if (in == null) 820 throw new XZIOException("Stream closed"); 821 822 if (exception != null) 823 throw exception; 824 825 int size = 0; 826 827 try { 828 if (seekNeeded) 829 seek(); 830 831 if (endReached) 832 return -1; 833 834 while (len > 0) { 835 if (blockDecoder == null) { 836 seek(); 837 if (endReached) 838 break; 839 } 840 841 int ret = blockDecoder.read(buf, off, len); 842 843 if (ret > 0) { 844 curPos += ret; 845 size += ret; 846 off += ret; 847 len -= ret; 848 } else if (ret == -1) { 849 blockDecoder = null; 850 } 851 } 852 } catch (IOException e) { 853 // We know that the file isn't simply truncated because we could 854 // parse the Indexes in the constructor. So convert EOFException 855 // to CorruptedInputException. 856 if (e instanceof EOFException) 857 e = new CorruptedInputException(); 858 859 exception = e; 860 if (size == 0) 861 throw e; 862 } 863 864 return size; 865 } 866 867 /** 868 * Returns the number of uncompressed bytes that can be read 869 * without blocking. 
The value is returned with an assumption 870 * that the compressed input data will be valid. If the compressed 871 * data is corrupt, <code>CorruptedInputException</code> may get 872 * thrown before the number of bytes claimed to be available have 873 * been read from this input stream. 874 * 875 * @return the number of uncompressed bytes that can be read 876 * without blocking 877 */ available()878 public int available() throws IOException { 879 if (in == null) 880 throw new XZIOException("Stream closed"); 881 882 if (exception != null) 883 throw exception; 884 885 if (endReached || seekNeeded || blockDecoder == null) 886 return 0; 887 888 return blockDecoder.available(); 889 } 890 891 /** 892 * Closes the stream and calls <code>in.close()</code>. 893 * If the stream was already closed, this does nothing. 894 * <p> 895 * This is equivalent to <code>close(true)</code>. 896 * 897 * @throws IOException if thrown by <code>in.close()</code> 898 */ close()899 public void close() throws IOException { 900 close(true); 901 } 902 903 /** 904 * Closes the stream and optionally calls <code>in.close()</code>. 905 * If the stream was already closed, this does nothing. 906 * If <code>close(false)</code> has been called, a further 907 * call of <code>close(true)</code> does nothing (it doesn't call 908 * <code>in.close()</code>). 909 * <p> 910 * If you don't want to close the underlying <code>InputStream</code>, 911 * there is usually no need to worry about closing this stream either; 912 * it's fine to do nothing and let the garbage collector handle it. 913 * However, if you are using {@link ArrayCache}, <code>close(false)</code> 914 * can be useful to put the allocated arrays back to the cache without 915 * closing the underlying <code>InputStream</code>. 916 * <p> 917 * Note that if you successfully reach the end of the stream 918 * (<code>read</code> returns <code>-1</code>), the arrays are 919 * automatically put back to the cache by that <code>read</code> call. 
In 920 * this situation <code>close(false)</code> is redundant (but harmless). 921 * 922 * @throws IOException if thrown by <code>in.close()</code> 923 * 924 * @since 1.7 925 */ close(boolean closeInput)926 public void close(boolean closeInput) throws IOException { 927 if (in != null) { 928 if (blockDecoder != null) { 929 blockDecoder.close(); 930 blockDecoder = null; 931 } 932 933 try { 934 if (closeInput) 935 in.close(); 936 } finally { 937 in = null; 938 } 939 } 940 } 941 942 /** 943 * Gets the uncompressed size of this input stream. If there are multiple 944 * XZ Streams, the total uncompressed size of all XZ Streams is returned. 945 */ length()946 public long length() { 947 return uncompressedSize; 948 } 949 950 /** 951 * Gets the current uncompressed position in this input stream. 952 * 953 * @throws XZIOException if the stream has been closed 954 */ position()955 public long position() throws IOException { 956 if (in == null) 957 throw new XZIOException("Stream closed"); 958 959 return seekNeeded ? seekPos : curPos; 960 } 961 962 /** 963 * Seeks to the specified absolute uncompressed position in the stream. 964 * This only stores the new position, so this function itself is always 965 * very fast. The actual seek is done when <code>read</code> is called 966 * to read at least one byte. 967 * <p> 968 * Seeking past the end of the stream is possible. In that case 969 * <code>read</code> will return <code>-1</code> to indicate 970 * the end of the stream. 971 * 972 * @param pos new uncompressed read position 973 * 974 * @throws XZIOException 975 * if <code>pos</code> is negative, or 976 * if stream has been closed 977 */ seek(long pos)978 public void seek(long pos) throws IOException { 979 if (in == null) 980 throw new XZIOException("Stream closed"); 981 982 if (pos < 0) 983 throw new XZIOException("Negative seek position: " + pos); 984 985 seekPos = pos; 986 seekNeeded = true; 987 } 988 989 /** 990 * Seeks to the beginning of the given XZ Block. 
991 * 992 * @throws XZIOException 993 * if <code>blockNumber < 0</code> or 994 * <code>blockNumber >= getBlockCount()</code>, 995 * or if stream has been closed 996 * 997 * @since 1.3 998 */ seekToBlock(int blockNumber)999 public void seekToBlock(int blockNumber) throws IOException { 1000 if (in == null) 1001 throw new XZIOException("Stream closed"); 1002 1003 if (blockNumber < 0 || blockNumber >= blockCount) 1004 throw new XZIOException("Invalid XZ Block number: " + blockNumber); 1005 1006 // This is a bit silly implementation. Here we locate the uncompressed 1007 // offset of the specified Block, then when doing the actual seek in 1008 // seek(), we need to find the Block number based on seekPos. 1009 seekPos = getBlockPos(blockNumber); 1010 seekNeeded = true; 1011 } 1012 1013 /** 1014 * Does the actual seeking. This is also called when <code>read</code> 1015 * needs a new Block to decode. 1016 */ seek()1017 private void seek() throws IOException { 1018 // If seek(long) wasn't called, we simply need to get the next Block 1019 // from the same Stream. If there are no more Blocks in this Stream, 1020 // then we behave as if seek(long) had been called. 1021 if (!seekNeeded) { 1022 if (curBlockInfo.hasNext()) { 1023 curBlockInfo.setNext(); 1024 initBlockDecoder(); 1025 return; 1026 } 1027 1028 seekPos = curPos; 1029 } 1030 1031 seekNeeded = false; 1032 1033 // Check if we are seeking to or past the end of the file. 1034 if (seekPos >= uncompressedSize) { 1035 curPos = seekPos; 1036 1037 if (blockDecoder != null) { 1038 blockDecoder.close(); 1039 blockDecoder = null; 1040 } 1041 1042 endReached = true; 1043 return; 1044 } 1045 1046 endReached = false; 1047 1048 // Locate the Block that contains the uncompressed target position. 1049 locateBlockByPos(curBlockInfo, seekPos); 1050 1051 // Seek in the underlying stream and create a new Block decoder 1052 // only if really needed. 
We can skip it if the current position 1053 // is already in the correct Block and the target position hasn't 1054 // been decompressed yet. 1055 // 1056 // NOTE: If curPos points to the beginning of this Block, it's 1057 // because it was left there after decompressing an earlier Block. 1058 // In that case, decoding of the current Block hasn't been started 1059 // yet. (Decoding of a Block won't be started until at least one 1060 // byte will also be read from it.) 1061 if (!(curPos > curBlockInfo.uncompressedOffset && curPos <= seekPos)) { 1062 // Seek to the beginning of the Block. 1063 in.seek(curBlockInfo.compressedOffset); 1064 1065 // Since it is possible that this Block is from a different 1066 // Stream than the previous Block, initialize a new Check. 1067 check = Check.getInstance(curBlockInfo.getCheckType()); 1068 1069 // Create a new Block decoder. 1070 initBlockDecoder(); 1071 curPos = curBlockInfo.uncompressedOffset; 1072 } 1073 1074 // If the target wasn't at a Block boundary, decompress and throw 1075 // away data to reach the target position. 1076 if (seekPos > curPos) { 1077 // NOTE: The "if" below is there just in case. In this situation, 1078 // blockDecoder.skip will always skip the requested amount 1079 // or throw an exception. 1080 long skipAmount = seekPos - curPos; 1081 if (blockDecoder.skip(skipAmount) != skipAmount) 1082 throw new CorruptedInputException(); 1083 1084 curPos = seekPos; 1085 } 1086 } 1087 1088 /** 1089 * Locates the Block that contains the given uncompressed position. 1090 */ locateBlockByPos(BlockInfo info, long pos)1091 private void locateBlockByPos(BlockInfo info, long pos) { 1092 if (pos < 0 || pos >= uncompressedSize) 1093 throw new IndexOutOfBoundsException( 1094 "Invalid uncompressed position: " + pos); 1095 1096 // Locate the Stream that contains the target position. 
1097 IndexDecoder index; 1098 for (int i = 0; ; ++i) { 1099 index = streams.get(i); 1100 if (index.hasUncompressedOffset(pos)) 1101 break; 1102 } 1103 1104 // Locate the Block from the Stream that contains the target position. 1105 index.locateBlock(info, pos); 1106 1107 assert (info.compressedOffset & 3) == 0; 1108 assert info.uncompressedSize > 0; 1109 assert pos >= info.uncompressedOffset; 1110 assert pos < info.uncompressedOffset + info.uncompressedSize; 1111 } 1112 1113 /** 1114 * Locates the given Block and stores information about it 1115 * to <code>info</code>. 1116 */ 1117 private void locateBlockByNumber(BlockInfo info, int blockNumber) { 1118 // Validate. 1119 if (blockNumber < 0 || blockNumber >= blockCount) 1120 throw new IndexOutOfBoundsException( 1121 "Invalid XZ Block number: " + blockNumber); 1122 1123 // Skip the search if info already points to the correct Block. 1124 if (info.blockNumber == blockNumber) 1125 return; 1126 1127 // Search the Stream that contains the given Block and then 1128 // search the Block from that Stream. 1129 for (int i = 0; ; ++i) { 1130 IndexDecoder index = streams.get(i); 1131 if (index.hasRecord(blockNumber)) { 1132 index.setBlockInfo(info, blockNumber); 1133 return; 1134 } 1135 } 1136 } 1137 1138 /** 1139 * Initializes a new BlockInputStream. This is a helper function for 1140 * <code>seek()</code>. 1141 */ 1142 private void initBlockDecoder() throws IOException { 1143 try { 1144 // Set it to null first so that GC can collect it if memory 1145 // runs tight when initializing a new BlockInputStream. 1146 if (blockDecoder != null) { 1147 blockDecoder.close(); 1148 blockDecoder = null; 1149 } 1150 1151 blockDecoder = new BlockInputStream( 1152 in, check, verifyCheck, memoryLimit, 1153 curBlockInfo.unpaddedSize, curBlockInfo.uncompressedSize, 1154 arrayCache); 1155 } catch (MemoryLimitException e) { 1156 // BlockInputStream doesn't know how much memory we had 1157 // already needed so we need to recreate the exception. 
1158 assert memoryLimit >= 0; 1159 throw new MemoryLimitException( 1160 e.getMemoryNeeded() + indexMemoryUsage, 1161 memoryLimit + indexMemoryUsage); 1162 } catch (IndexIndicatorException e) { 1163 // It cannot be Index so the file must be corrupt. 1164 throw new CorruptedInputException(); 1165 } 1166 } 1167 } 1168