• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * SeekableXZInputStream
3  *
4  * Author: Lasse Collin <lasse.collin@tukaani.org>
5  *
6  * This file has been put into the public domain.
7  * You can do whatever you want with this file.
8  */
9 
10 package org.tukaani.xz;
11 
12 import java.util.Arrays;
13 import java.util.ArrayList;
14 import java.io.DataInputStream;
15 import java.io.IOException;
16 import java.io.EOFException;
17 import org.tukaani.xz.common.DecoderUtil;
18 import org.tukaani.xz.common.StreamFlags;
19 import org.tukaani.xz.check.Check;
20 import org.tukaani.xz.index.IndexDecoder;
21 import org.tukaani.xz.index.BlockInfo;
22 
23 /**
24  * Decompresses a .xz file in random access mode.
25  * This supports decompressing concatenated .xz files.
26  * <p>
27  * Each .xz file consists of one or more Streams. Each Stream consists of zero
28  * or more Blocks. Each Stream contains an Index of Streams' Blocks.
29  * The Indexes from all Streams are loaded in RAM by a constructor of this
30  * class. A typical .xz file has only one Stream, and parsing its Index will
31  * need only three or four seeks.
32  * <p>
33  * To make random access possible, the data in a .xz file must be split
34  * into multiple Blocks of reasonable size. Decompression can only start at
35  * a Block boundary. When seeking to an uncompressed position that is not at
36  * a Block boundary, decompression starts at the beginning of the Block and
37  * throws away data until the target position is reached. Thus, smaller Blocks
38  * mean faster seeks to arbitrary uncompressed positions. On the other hand,
39  * smaller Blocks mean worse compression. So one has to make a compromise
40  * between random access speed and compression ratio.
41  * <p>
42  * Implementation note: This class uses linear search to locate the correct
43  * Stream from the data structures in RAM. It was the simplest to implement
44  * and should be fine as long as there aren't too many Streams. The correct
45  * Block inside a Stream is located using binary search and thus is fast
46  * even with a huge number of Blocks.
47  *
48  * <h2>Memory usage</h2>
49  * <p>
50  * The amount of memory needed for the Indexes is taken into account when
51  * checking the memory usage limit. Each Stream is calculated to need at
52  * least 1&nbsp;KiB of memory and each Block 16 bytes of memory, rounded up
53  * to the next kibibyte. So unless the file has a huge number of Streams or
54  * Blocks, these don't take a significant amount of memory.
55  *
56  * <h2>Creating random-accessible .xz files</h2>
57  * <p>
58  * When using {@link XZOutputStream}, a new Block can be started by calling
59  * its {@link XZOutputStream#endBlock() endBlock} method. If you know
60  * that the decompressor will only need to seek to certain uncompressed
61  * positions, it can be a good idea to start a new Block at (some of) these
62  * positions (and only at these positions to get better compression ratio).
63  * <p>
64  * liblzma in XZ Utils supports starting a new Block with
65  * <code>LZMA_FULL_FLUSH</code>. XZ Utils 5.1.1alpha added threaded
66  * compression which creates multi-Block .xz files. XZ Utils 5.1.1alpha
67  * also added the option <code>--block-size=SIZE</code> to the xz command
68  * line tool. XZ Utils 5.1.2alpha added a partial implementation of
69  * <code>--block-list=SIZES</code> which allows specifying sizes of
70  * individual Blocks.
71  *
72  * <h2>Example: getting the uncompressed size of a .xz file</h2>
73  * <blockquote><pre>
74  * String filename = "foo.xz";
75  * SeekableFileInputStream seekableFile
76  *         = new SeekableFileInputStream(filename);
77  *
78  * try {
79  *     SeekableXZInputStream seekableXZ
80  *             = new SeekableXZInputStream(seekableFile);
81  *     System.out.println("Uncompressed size: " + seekableXZ.length());
82  * } finally {
83  *     seekableFile.close();
84  * }
85  * </pre></blockquote>
86  *
87  * @see SeekableFileInputStream
88  * @see XZInputStream
89  * @see XZOutputStream
90  */
91 public class SeekableXZInputStream extends SeekableInputStream {
92     /**
93      * Cache for big arrays.
94      */
95     private final ArrayCache arrayCache;
96 
97     /**
98      * The input stream containing XZ compressed data.
99      */
100     private SeekableInputStream in;
101 
102     /**
103      * Memory usage limit after the memory usage of the IndexDecoders has
104      * been subtracted.
105      */
106     private final int memoryLimit;
107 
108     /**
109      * Memory usage of the IndexDecoders.
110      * <code>memoryLimit + indexMemoryUsage</code> equals the original
111      * memory usage limit that was passed to the constructor.
112      */
113     private int indexMemoryUsage = 0;
114 
115     /**
116      * List of IndexDecoders, one for each Stream in the file.
117      * The list is in reverse order: The first element is
118      * the last Stream in the file.
119      */
120     private final ArrayList<IndexDecoder> streams
121             = new ArrayList<IndexDecoder>();
122 
123     /**
124      * Bitmask of all Check IDs seen.
125      */
126     private int checkTypes = 0;
127 
128     /**
129      * Uncompressed size of the file (all Streams).
130      */
131     private long uncompressedSize = 0;
132 
133     /**
134      * Uncompressed size of the largest XZ Block in the file.
135      */
136     private long largestBlockSize = 0;
137 
138     /**
139      * Number of XZ Blocks in the file.
140      */
141     private int blockCount = 0;
142 
143     /**
144      * Size and position information about the current Block.
145      * If there are no Blocks, all values will be <code>-1</code>.
146      */
147     private final BlockInfo curBlockInfo;
148 
149     /**
150      * Temporary (and cached) information about the Block whose information
151      * is queried via <code>getBlockPos</code> and related functions.
152      */
153     private final BlockInfo queriedBlockInfo;
154 
155     /**
156      * Integrity Check in the current XZ Stream. The constructor leaves
157      * this to point to the Check of the first Stream.
158      */
159     private Check check;
160 
161     /**
162      * Flag indicating if the integrity checks will be verified.
163      */
164     private final boolean verifyCheck;
165 
166     /**
167      * Decoder of the current XZ Block, if any.
168      */
169     private BlockInputStream blockDecoder = null;
170 
171     /**
172      * Current uncompressed position.
173      */
174     private long curPos = 0;
175 
176     /**
177      * Target position for seeking.
178      */
179     private long seekPos;
180 
181     /**
182      * True when <code>seek(long)</code> has been called but the actual
183      * seeking hasn't been done yet.
184      */
185     private boolean seekNeeded = false;
186 
187     /**
188      * True when end of the file was reached. This can be cleared by
189      * calling <code>seek(long)</code>.
190      */
191     private boolean endReached = false;
192 
193     /**
194      * Pending exception from an earlier error.
195      */
196     private IOException exception = null;
197 
198     /**
199      * Temporary buffer for read(). This avoids reallocating memory
200      * on every read() call.
201      */
202     private final byte[] tempBuf = new byte[1];
203 
204     /**
205      * Creates a new seekable XZ decompressor without a memory usage limit.
206      *
207      * @param       in          seekable input stream containing one or more
208      *                          XZ Streams; the whole input stream is used
209      *
210      * @throws      XZFormatException
211      *                          input is not in the XZ format
212      *
213      * @throws      CorruptedInputException
214      *                          XZ data is corrupt or truncated
215      *
216      * @throws      UnsupportedOptionsException
217      *                          XZ headers seem valid but they specify
218      *                          options not supported by this implementation
219      *
220      * @throws      EOFException
221      *                          less than 6 bytes of input was available
222      *                          from <code>in</code>, or (unlikely) the size
223      *                          of the underlying stream got smaller while
224      *                          this was reading from it
225      *
226      * @throws      IOException may be thrown by <code>in</code>
227      */
SeekableXZInputStream(SeekableInputStream in)228     public SeekableXZInputStream(SeekableInputStream in)
229             throws IOException {
230         this(in, -1);
231     }
232 
233     /**
234      * Creates a new seekable XZ decompressor without a memory usage limit.
235      * <p>
236      * This is identical to
237      * <code>SeekableXZInputStream(SeekableInputStream)</code> except that
238      * this also takes the <code>arrayCache</code> argument.
239      *
240      * @param       in          seekable input stream containing one or more
241      *                          XZ Streams; the whole input stream is used
242      *
243      * @param       arrayCache  cache to be used for allocating large arrays
244      *
245      * @throws      XZFormatException
246      *                          input is not in the XZ format
247      *
248      * @throws      CorruptedInputException
249      *                          XZ data is corrupt or truncated
250      *
251      * @throws      UnsupportedOptionsException
252      *                          XZ headers seem valid but they specify
253      *                          options not supported by this implementation
254      *
255      * @throws      EOFException
256      *                          less than 6 bytes of input was available
257      *                          from <code>in</code>, or (unlikely) the size
258      *                          of the underlying stream got smaller while
259      *                          this was reading from it
260      *
261      * @throws      IOException may be thrown by <code>in</code>
262      *
263      * @since 1.7
264      */
SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache)265     public SeekableXZInputStream(SeekableInputStream in, ArrayCache arrayCache)
266             throws IOException {
267         this(in, -1, arrayCache);
268     }
269 
270     /**
271      * Creates a new seekable XZ decomporessor with an optional
272      * memory usage limit.
273      *
274      * @param       in          seekable input stream containing one or more
275      *                          XZ Streams; the whole input stream is used
276      *
277      * @param       memoryLimit memory usage limit in kibibytes (KiB)
278      *                          or <code>-1</code> to impose no
279      *                          memory usage limit
280      *
281      * @throws      XZFormatException
282      *                          input is not in the XZ format
283      *
284      * @throws      CorruptedInputException
285      *                          XZ data is corrupt or truncated
286      *
287      * @throws      UnsupportedOptionsException
288      *                          XZ headers seem valid but they specify
289      *                          options not supported by this implementation
290      *
291      * @throws      MemoryLimitException
292      *                          decoded XZ Indexes would need more memory
293      *                          than allowed by the memory usage limit
294      *
295      * @throws      EOFException
296      *                          less than 6 bytes of input was available
297      *                          from <code>in</code>, or (unlikely) the size
298      *                          of the underlying stream got smaller while
299      *                          this was reading from it
300      *
301      * @throws      IOException may be thrown by <code>in</code>
302      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit)303     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit)
304             throws IOException {
305         this(in, memoryLimit, true);
306     }
307 
308     /**
309      * Creates a new seekable XZ decomporessor with an optional
310      * memory usage limit.
311      * <p>
312      * This is identical to
313      * <code>SeekableXZInputStream(SeekableInputStream,int)</code>
314      * except that this also takes the <code>arrayCache</code> argument.
315      *
316      * @param       in          seekable input stream containing one or more
317      *                          XZ Streams; the whole input stream is used
318      *
319      * @param       memoryLimit memory usage limit in kibibytes (KiB)
320      *                          or <code>-1</code> to impose no
321      *                          memory usage limit
322      *
323      * @param       arrayCache  cache to be used for allocating large arrays
324      *
325      * @throws      XZFormatException
326      *                          input is not in the XZ format
327      *
328      * @throws      CorruptedInputException
329      *                          XZ data is corrupt or truncated
330      *
331      * @throws      UnsupportedOptionsException
332      *                          XZ headers seem valid but they specify
333      *                          options not supported by this implementation
334      *
335      * @throws      MemoryLimitException
336      *                          decoded XZ Indexes would need more memory
337      *                          than allowed by the memory usage limit
338      *
339      * @throws      EOFException
340      *                          less than 6 bytes of input was available
341      *                          from <code>in</code>, or (unlikely) the size
342      *                          of the underlying stream got smaller while
343      *                          this was reading from it
344      *
345      * @throws      IOException may be thrown by <code>in</code>
346      *
347      * @since 1.7
348      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, ArrayCache arrayCache)349     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
350                                  ArrayCache arrayCache)
351             throws IOException {
352         this(in, memoryLimit, true, arrayCache);
353     }
354 
355     /**
356      * Creates a new seekable XZ decomporessor with an optional
357      * memory usage limit and ability to disable verification
358      * of integrity checks.
359      * <p>
360      * Note that integrity check verification should almost never be disabled.
361      * Possible reasons to disable integrity check verification:
362      * <ul>
363      *   <li>Trying to recover data from a corrupt .xz file.</li>
364      *   <li>Speeding up decompression. This matters mostly with SHA-256
365      *   or with files that have compressed extremely well. It's recommended
366      *   that integrity checking isn't disabled for performance reasons
367      *   unless the file integrity is verified externally in some other
368      *   way.</li>
369      * </ul>
370      * <p>
371      * <code>verifyCheck</code> only affects the integrity check of
372      * the actual compressed data. The CRC32 fields in the headers
373      * are always verified.
374      *
375      * @param       in          seekable input stream containing one or more
376      *                          XZ Streams; the whole input stream is used
377      *
378      * @param       memoryLimit memory usage limit in kibibytes (KiB)
379      *                          or <code>-1</code> to impose no
380      *                          memory usage limit
381      *
382      * @param       verifyCheck if <code>true</code>, the integrity checks
383      *                          will be verified; this should almost never
384      *                          be set to <code>false</code>
385      *
386      * @throws      XZFormatException
387      *                          input is not in the XZ format
388      *
389      * @throws      CorruptedInputException
390      *                          XZ data is corrupt or truncated
391      *
392      * @throws      UnsupportedOptionsException
393      *                          XZ headers seem valid but they specify
394      *                          options not supported by this implementation
395      *
396      * @throws      MemoryLimitException
397      *                          decoded XZ Indexes would need more memory
398      *                          than allowed by the memory usage limit
399      *
400      * @throws      EOFException
401      *                          less than 6 bytes of input was available
402      *                          from <code>in</code>, or (unlikely) the size
403      *                          of the underlying stream got smaller while
404      *                          this was reading from it
405      *
406      * @throws      IOException may be thrown by <code>in</code>
407      *
408      * @since 1.6
409      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck)410     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
411                                  boolean verifyCheck)
412             throws IOException {
413         this(in, memoryLimit, verifyCheck, ArrayCache.getDefaultCache());
414     }
415 
416     /**
417      * Creates a new seekable XZ decomporessor with an optional
418      * memory usage limit and ability to disable verification
419      * of integrity checks.
420      * <p>
421      * This is identical to
422      * <code>SeekableXZInputStream(SeekableInputStream,int,boolean)</code>
423      * except that this also takes the <code>arrayCache</code> argument.
424      *
425      * @param       in          seekable input stream containing one or more
426      *                          XZ Streams; the whole input stream is used
427      *
428      * @param       memoryLimit memory usage limit in kibibytes (KiB)
429      *                          or <code>-1</code> to impose no
430      *                          memory usage limit
431      *
432      * @param       verifyCheck if <code>true</code>, the integrity checks
433      *                          will be verified; this should almost never
434      *                          be set to <code>false</code>
435      *
436      * @param       arrayCache  cache to be used for allocating large arrays
437      *
438      * @throws      XZFormatException
439      *                          input is not in the XZ format
440      *
441      * @throws      CorruptedInputException
442      *                          XZ data is corrupt or truncated
443      *
444      * @throws      UnsupportedOptionsException
445      *                          XZ headers seem valid but they specify
446      *                          options not supported by this implementation
447      *
448      * @throws      MemoryLimitException
449      *                          decoded XZ Indexes would need more memory
450      *                          than allowed by the memory usage limit
451      *
452      * @throws      EOFException
453      *                          less than 6 bytes of input was available
454      *                          from <code>in</code>, or (unlikely) the size
455      *                          of the underlying stream got smaller while
456      *                          this was reading from it
457      *
458      * @throws      IOException may be thrown by <code>in</code>
459      *
460      * @since 1.7
461      */
SeekableXZInputStream(SeekableInputStream in, int memoryLimit, boolean verifyCheck, ArrayCache arrayCache)462     public SeekableXZInputStream(SeekableInputStream in, int memoryLimit,
463                                  boolean verifyCheck, ArrayCache arrayCache)
464             throws IOException {
465         this.arrayCache = arrayCache;
466         this.verifyCheck = verifyCheck;
467         this.in = in;
468         DataInputStream inData = new DataInputStream(in);
469 
470         // Check the magic bytes in the beginning of the file.
471         {
472             in.seek(0);
473             byte[] buf = new byte[XZ.HEADER_MAGIC.length];
474             inData.readFully(buf);
475             if (!Arrays.equals(buf, XZ.HEADER_MAGIC))
476                 throw new XZFormatException();
477         }
478 
479         // Get the file size and verify that it is a multiple of 4 bytes.
480         long pos = in.length();
481         if ((pos & 3) != 0)
482             throw new CorruptedInputException(
483                     "XZ file size is not a multiple of 4 bytes");
484 
485         // Parse the headers starting from the end of the file.
486         byte[] buf = new byte[DecoderUtil.STREAM_HEADER_SIZE];
487         long streamPadding = 0;
488 
489         while (pos > 0) {
490             if (pos < DecoderUtil.STREAM_HEADER_SIZE)
491                 throw new CorruptedInputException();
492 
493             // Read the potential Stream Footer.
494             in.seek(pos - DecoderUtil.STREAM_HEADER_SIZE);
495             inData.readFully(buf);
496 
497             // Skip Stream Padding four bytes at a time.
498             // Skipping more at once would be faster,
499             // but usually there isn't much Stream Padding.
500             if (buf[8] == 0x00 && buf[9] == 0x00 && buf[10] == 0x00
501                     && buf[11] == 0x00) {
502                 streamPadding += 4;
503                 pos -= 4;
504                 continue;
505             }
506 
507             // It's not Stream Padding. Update pos.
508             pos -= DecoderUtil.STREAM_HEADER_SIZE;
509 
510             // Decode the Stream Footer and check if Backward Size
511             // looks reasonable.
512             StreamFlags streamFooter = DecoderUtil.decodeStreamFooter(buf);
513             if (streamFooter.backwardSize >= pos)
514                 throw new CorruptedInputException(
515                         "Backward Size in XZ Stream Footer is too big");
516 
517             // Check that the Check ID is supported. Store it in case this
518             // is the first Stream in the file.
519             check = Check.getInstance(streamFooter.checkType);
520 
521             // Remember which Check IDs have been seen.
522             checkTypes |= 1 << streamFooter.checkType;
523 
524             // Seek to the beginning of the Index.
525             in.seek(pos - streamFooter.backwardSize);
526 
527             // Decode the Index field.
528             IndexDecoder index;
529             try {
530                 index = new IndexDecoder(in, streamFooter, streamPadding,
531                                          memoryLimit);
532             } catch (MemoryLimitException e) {
533                 // IndexDecoder doesn't know how much memory we had
534                 // already needed so we need to recreate the exception.
535                 assert memoryLimit >= 0;
536                 throw new MemoryLimitException(
537                         e.getMemoryNeeded() + indexMemoryUsage,
538                         memoryLimit + indexMemoryUsage);
539             }
540 
541             // Update the memory usage and limit counters.
542             indexMemoryUsage += index.getMemoryUsage();
543             if (memoryLimit >= 0) {
544                 memoryLimit -= index.getMemoryUsage();
545                 assert memoryLimit >= 0;
546             }
547 
548             // Remember the uncompressed size of the largest Block.
549             if (largestBlockSize < index.getLargestBlockSize())
550                 largestBlockSize = index.getLargestBlockSize();
551 
552             // Calculate the offset to the beginning of this XZ Stream and
553             // check that it looks sane.
554             long off = index.getStreamSize() - DecoderUtil.STREAM_HEADER_SIZE;
555             if (pos < off)
556                 throw new CorruptedInputException("XZ Index indicates "
557                         + "too big compressed size for the XZ Stream");
558 
559             // Seek to the beginning of this Stream.
560             pos -= off;
561             in.seek(pos);
562 
563             // Decode the Stream Header.
564             inData.readFully(buf);
565             StreamFlags streamHeader = DecoderUtil.decodeStreamHeader(buf);
566 
567             // Verify that the Stream Header matches the Stream Footer.
568             if (!DecoderUtil.areStreamFlagsEqual(streamHeader, streamFooter))
569                 throw new CorruptedInputException(
570                         "XZ Stream Footer does not match Stream Header");
571 
572             // Update the total uncompressed size of the file and check that
573             // it doesn't overflow.
574             uncompressedSize += index.getUncompressedSize();
575             if (uncompressedSize < 0)
576                 throw new UnsupportedOptionsException("XZ file is too big");
577 
578             // Update the Block count and check that it fits into an int.
579             blockCount += index.getRecordCount();
580             if (blockCount < 0)
581                 throw new UnsupportedOptionsException(
582                         "XZ file has over " + Integer.MAX_VALUE + " Blocks");
583 
584             // Add this Stream to the list of Streams.
585             streams.add(index);
586 
587             // Reset to be ready to parse the next Stream.
588             streamPadding = 0;
589         }
590 
591         assert pos == 0;
592 
593         // Save it now that indexMemoryUsage has been substracted from it.
594         this.memoryLimit = memoryLimit;
595 
596         // Store the relative offsets of the Streams. This way we don't
597         // need to recalculate them in this class when seeking; the
598         // IndexDecoder instances will handle them.
599         IndexDecoder prev = streams.get(streams.size() - 1);
600         for (int i = streams.size() - 2; i >= 0; --i) {
601             IndexDecoder cur = streams.get(i);
602             cur.setOffsets(prev);
603             prev = cur;
604         }
605 
606         // Initialize curBlockInfo to point to the first Stream.
607         // The blockNumber will be left to -1 so that .hasNext()
608         // and .setNext() work to get the first Block when starting
609         // to decompress from the beginning of the file.
610         IndexDecoder first = streams.get(streams.size() - 1);
611         curBlockInfo = new BlockInfo(first);
612 
613         // queriedBlockInfo needs to be allocated too. The Stream used for
614         // initialization doesn't matter though.
615         queriedBlockInfo = new BlockInfo(first);
616     }
617 
618     /**
619      * Gets the types of integrity checks used in the .xz file.
620      * Multiple checks are possible only if there are multiple
621      * concatenated XZ Streams.
622      * <p>
623      * The returned value has a bit set for every check type that is present.
624      * For example, if CRC64 and SHA-256 were used, the return value is
625      * <code>(1&nbsp;&lt;&lt;&nbsp;XZ.CHECK_CRC64)
626      * | (1&nbsp;&lt;&lt;&nbsp;XZ.CHECK_SHA256)</code>.
627      */
getCheckTypes()628     public int getCheckTypes() {
629         return checkTypes;
630     }
631 
632     /**
633      * Gets the amount of memory in kibibytes (KiB) used by
634      * the data structures needed to locate the XZ Blocks.
635      * This is usually useless information but since it is calculated
636      * for memory usage limit anyway, it is nice to make it available to too.
637      */
getIndexMemoryUsage()638     public int getIndexMemoryUsage() {
639         return indexMemoryUsage;
640     }
641 
642     /**
643      * Gets the uncompressed size of the largest XZ Block in bytes.
644      * This can be useful if you want to check that the file doesn't
645      * have huge XZ Blocks which could make seeking to arbitrary offsets
646      * very slow. Note that huge Blocks don't automatically mean that
647      * seeking would be slow, for example, seeking to the beginning of
648      * any Block is always fast.
649      */
getLargestBlockSize()650     public long getLargestBlockSize() {
651         return largestBlockSize;
652     }
653 
654     /**
655      * Gets the number of Streams in the .xz file.
656      *
657      * @since 1.3
658      */
getStreamCount()659     public int getStreamCount() {
660         return streams.size();
661     }
662 
663     /**
664      * Gets the number of Blocks in the .xz file.
665      *
666      * @since 1.3
667      */
getBlockCount()668     public int getBlockCount() {
669         return blockCount;
670     }
671 
672     /**
673      * Gets the uncompressed start position of the given Block.
674      *
675      * @throws  IndexOutOfBoundsException if
676      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
677      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
678      *
679      * @since 1.3
680      */
getBlockPos(int blockNumber)681     public long getBlockPos(int blockNumber) {
682         locateBlockByNumber(queriedBlockInfo, blockNumber);
683         return queriedBlockInfo.uncompressedOffset;
684     }
685 
686     /**
687      * Gets the uncompressed size of the given Block.
688      *
689      * @throws  IndexOutOfBoundsException if
690      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
691      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
692      *
693      * @since 1.3
694      */
getBlockSize(int blockNumber)695     public long getBlockSize(int blockNumber) {
696         locateBlockByNumber(queriedBlockInfo, blockNumber);
697         return queriedBlockInfo.uncompressedSize;
698     }
699 
700     /**
701      * Gets the position where the given compressed Block starts in
702      * the underlying .xz file.
703      * This information is rarely useful to the users of this class.
704      *
705      * @throws  IndexOutOfBoundsException if
706      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
707      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
708      *
709      * @since 1.3
710      */
getBlockCompPos(int blockNumber)711     public long getBlockCompPos(int blockNumber) {
712         locateBlockByNumber(queriedBlockInfo, blockNumber);
713         return queriedBlockInfo.compressedOffset;
714     }
715 
716     /**
717      * Gets the compressed size of the given Block.
718      * This together with the uncompressed size can be used to calculate
719      * the compression ratio of the specific Block.
720      *
721      * @throws  IndexOutOfBoundsException if
722      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
723      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
724      *
725      * @since 1.3
726      */
getBlockCompSize(int blockNumber)727     public long getBlockCompSize(int blockNumber) {
728         locateBlockByNumber(queriedBlockInfo, blockNumber);
729         return (queriedBlockInfo.unpaddedSize + 3) & ~3;
730     }
731 
732     /**
733      * Gets integrity check type (Check ID) of the given Block.
734      *
735      * @throws  IndexOutOfBoundsException if
736      *          <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
737      *          <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>.
738      *
739      * @see #getCheckTypes()
740      *
741      * @since 1.3
742      */
getBlockCheckType(int blockNumber)743     public int getBlockCheckType(int blockNumber) {
744         locateBlockByNumber(queriedBlockInfo, blockNumber);
745         return queriedBlockInfo.getCheckType();
746     }
747 
748     /**
749      * Gets the number of the Block that contains the byte at the given
750      * uncompressed position.
751      *
752      * @throws  IndexOutOfBoundsException if
753      *          <code>pos&nbsp;&lt;&nbsp;0</code> or
754      *          <code>pos&nbsp;&gt;=&nbsp;length()</code>.
755      *
756      * @since 1.3
757      */
getBlockNumber(long pos)758     public int getBlockNumber(long pos) {
759         locateBlockByPos(queriedBlockInfo, pos);
760         return queriedBlockInfo.blockNumber;
761     }
762 
763     /**
764      * Decompresses the next byte from this input stream.
765      *
766      * @return      the next decompressed byte, or <code>-1</code>
767      *              to indicate the end of the compressed stream
768      *
769      * @throws      CorruptedInputException
770      * @throws      UnsupportedOptionsException
771      * @throws      MemoryLimitException
772      *
773      * @throws      XZIOException if the stream has been closed
774      *
775      * @throws      IOException may be thrown by <code>in</code>
776      */
read()777     public int read() throws IOException {
778         return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
779     }
780 
781     /**
782      * Decompresses into an array of bytes.
783      * <p>
784      * If <code>len</code> is zero, no bytes are read and <code>0</code>
785      * is returned. Otherwise this will try to decompress <code>len</code>
786      * bytes of uncompressed data. Less than <code>len</code> bytes may
787      * be read only in the following situations:
788      * <ul>
789      *   <li>The end of the compressed data was reached successfully.</li>
790      *   <li>An error is detected after at least one but less than
791      *       <code>len</code> bytes have already been successfully
792      *       decompressed. The next call with non-zero <code>len</code>
793      *       will immediately throw the pending exception.</li>
794      *   <li>An exception is thrown.</li>
795      * </ul>
796      *
797      * @param       buf         target buffer for uncompressed data
798      * @param       off         start offset in <code>buf</code>
799      * @param       len         maximum number of uncompressed bytes to read
800      *
801      * @return      number of bytes read, or <code>-1</code> to indicate
802      *              the end of the compressed stream
803      *
804      * @throws      CorruptedInputException
805      * @throws      UnsupportedOptionsException
806      * @throws      MemoryLimitException
807      *
808      * @throws      XZIOException if the stream has been closed
809      *
810      * @throws      IOException may be thrown by <code>in</code>
811      */
read(byte[] buf, int off, int len)812     public int read(byte[] buf, int off, int len) throws IOException {
813         if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
814             throw new IndexOutOfBoundsException();
815 
816         if (len == 0)
817             return 0;
818 
819         if (in == null)
820             throw new XZIOException("Stream closed");
821 
822         if (exception != null)
823             throw exception;
824 
825         int size = 0;
826 
827         try {
828             if (seekNeeded)
829                 seek();
830 
831             if (endReached)
832                 return -1;
833 
834             while (len > 0) {
835                 if (blockDecoder == null) {
836                     seek();
837                     if (endReached)
838                         break;
839                 }
840 
841                 int ret = blockDecoder.read(buf, off, len);
842 
843                 if (ret > 0) {
844                     curPos += ret;
845                     size += ret;
846                     off += ret;
847                     len -= ret;
848                 } else if (ret == -1) {
849                     blockDecoder = null;
850                 }
851             }
852         } catch (IOException e) {
853             // We know that the file isn't simply truncated because we could
854             // parse the Indexes in the constructor. So convert EOFException
855             // to CorruptedInputException.
856             if (e instanceof EOFException)
857                 e = new CorruptedInputException();
858 
859             exception = e;
860             if (size == 0)
861                 throw e;
862         }
863 
864         return size;
865     }
866 
867     /**
868      * Returns the number of uncompressed bytes that can be read
869      * without blocking. The value is returned with an assumption
870      * that the compressed input data will be valid. If the compressed
871      * data is corrupt, <code>CorruptedInputException</code> may get
872      * thrown before the number of bytes claimed to be available have
873      * been read from this input stream.
874      *
875      * @return      the number of uncompressed bytes that can be read
876      *              without blocking
877      */
available()878     public int available() throws IOException {
879         if (in == null)
880             throw new XZIOException("Stream closed");
881 
882         if (exception != null)
883             throw exception;
884 
885         if (endReached || seekNeeded || blockDecoder == null)
886             return 0;
887 
888         return blockDecoder.available();
889     }
890 
891     /**
892      * Closes the stream and calls <code>in.close()</code>.
893      * If the stream was already closed, this does nothing.
894      * <p>
895      * This is equivalent to <code>close(true)</code>.
896      *
897      * @throws  IOException if thrown by <code>in.close()</code>
898      */
close()899     public void close() throws IOException {
900         close(true);
901     }
902 
903     /**
904      * Closes the stream and optionally calls <code>in.close()</code>.
905      * If the stream was already closed, this does nothing.
906      * If <code>close(false)</code> has been called, a further
907      * call of <code>close(true)</code> does nothing (it doesn't call
908      * <code>in.close()</code>).
909      * <p>
910      * If you don't want to close the underlying <code>InputStream</code>,
911      * there is usually no need to worry about closing this stream either;
912      * it's fine to do nothing and let the garbage collector handle it.
913      * However, if you are using {@link ArrayCache}, <code>close(false)</code>
914      * can be useful to put the allocated arrays back to the cache without
915      * closing the underlying <code>InputStream</code>.
916      * <p>
917      * Note that if you successfully reach the end of the stream
918      * (<code>read</code> returns <code>-1</code>), the arrays are
919      * automatically put back to the cache by that <code>read</code> call. In
920      * this situation <code>close(false)</code> is redundant (but harmless).
921      *
922      * @throws  IOException if thrown by <code>in.close()</code>
923      *
924      * @since 1.7
925      */
close(boolean closeInput)926     public void close(boolean closeInput) throws IOException {
927         if (in != null) {
928             if (blockDecoder != null) {
929                 blockDecoder.close();
930                 blockDecoder = null;
931             }
932 
933             try {
934                 if (closeInput)
935                     in.close();
936             } finally {
937                 in = null;
938             }
939         }
940     }
941 
942     /**
943      * Gets the uncompressed size of this input stream. If there are multiple
944      * XZ Streams, the total uncompressed size of all XZ Streams is returned.
945      */
length()946     public long length() {
947         return uncompressedSize;
948     }
949 
950     /**
951      * Gets the current uncompressed position in this input stream.
952      *
953      * @throws      XZIOException if the stream has been closed
954      */
position()955     public long position() throws IOException {
956         if (in == null)
957             throw new XZIOException("Stream closed");
958 
959         return seekNeeded ? seekPos : curPos;
960     }
961 
962     /**
963      * Seeks to the specified absolute uncompressed position in the stream.
964      * This only stores the new position, so this function itself is always
965      * very fast. The actual seek is done when <code>read</code> is called
966      * to read at least one byte.
967      * <p>
968      * Seeking past the end of the stream is possible. In that case
969      * <code>read</code> will return <code>-1</code> to indicate
970      * the end of the stream.
971      *
972      * @param       pos         new uncompressed read position
973      *
974      * @throws      XZIOException
975      *                          if <code>pos</code> is negative, or
976      *                          if stream has been closed
977      */
seek(long pos)978     public void seek(long pos) throws IOException {
979         if (in == null)
980             throw new XZIOException("Stream closed");
981 
982         if (pos < 0)
983             throw new XZIOException("Negative seek position: " + pos);
984 
985         seekPos = pos;
986         seekNeeded = true;
987     }
988 
989     /**
990      * Seeks to the beginning of the given XZ Block.
991      *
992      * @throws      XZIOException
993      *              if <code>blockNumber&nbsp;&lt;&nbsp;0</code> or
994      *              <code>blockNumber&nbsp;&gt;=&nbsp;getBlockCount()</code>,
995      *              or if stream has been closed
996      *
997      * @since 1.3
998      */
seekToBlock(int blockNumber)999     public void seekToBlock(int blockNumber) throws IOException {
1000         if (in == null)
1001             throw new XZIOException("Stream closed");
1002 
1003         if (blockNumber < 0 || blockNumber >= blockCount)
1004             throw new XZIOException("Invalid XZ Block number: " + blockNumber);
1005 
1006         // This is a bit silly implementation. Here we locate the uncompressed
1007         // offset of the specified Block, then when doing the actual seek in
1008         // seek(), we need to find the Block number based on seekPos.
1009         seekPos = getBlockPos(blockNumber);
1010         seekNeeded = true;
1011     }
1012 
1013     /**
1014      * Does the actual seeking. This is also called when <code>read</code>
1015      * needs a new Block to decode.
1016      */
seek()1017     private void seek() throws IOException {
1018         // If seek(long) wasn't called, we simply need to get the next Block
1019         // from the same Stream. If there are no more Blocks in this Stream,
1020         // then we behave as if seek(long) had been called.
1021         if (!seekNeeded) {
1022             if (curBlockInfo.hasNext()) {
1023                 curBlockInfo.setNext();
1024                 initBlockDecoder();
1025                 return;
1026             }
1027 
1028             seekPos = curPos;
1029         }
1030 
1031         seekNeeded = false;
1032 
1033         // Check if we are seeking to or past the end of the file.
1034         if (seekPos >= uncompressedSize) {
1035             curPos = seekPos;
1036 
1037             if (blockDecoder != null) {
1038                 blockDecoder.close();
1039                 blockDecoder = null;
1040             }
1041 
1042             endReached = true;
1043             return;
1044         }
1045 
1046         endReached = false;
1047 
1048         // Locate the Block that contains the uncompressed target position.
1049         locateBlockByPos(curBlockInfo, seekPos);
1050 
1051         // Seek in the underlying stream and create a new Block decoder
1052         // only if really needed. We can skip it if the current position
1053         // is already in the correct Block and the target position hasn't
1054         // been decompressed yet.
1055         //
1056         // NOTE: If curPos points to the beginning of this Block, it's
1057         // because it was left there after decompressing an earlier Block.
1058         // In that case, decoding of the current Block hasn't been started
1059         // yet. (Decoding of a Block won't be started until at least one
1060         // byte will also be read from it.)
1061         if (!(curPos > curBlockInfo.uncompressedOffset && curPos <= seekPos)) {
1062             // Seek to the beginning of the Block.
1063             in.seek(curBlockInfo.compressedOffset);
1064 
1065             // Since it is possible that this Block is from a different
1066             // Stream than the previous Block, initialize a new Check.
1067             check = Check.getInstance(curBlockInfo.getCheckType());
1068 
1069             // Create a new Block decoder.
1070             initBlockDecoder();
1071             curPos = curBlockInfo.uncompressedOffset;
1072         }
1073 
1074         // If the target wasn't at a Block boundary, decompress and throw
1075         // away data to reach the target position.
1076         if (seekPos > curPos) {
1077             // NOTE: The "if" below is there just in case. In this situation,
1078             // blockDecoder.skip will always skip the requested amount
1079             // or throw an exception.
1080             long skipAmount = seekPos - curPos;
1081             if (blockDecoder.skip(skipAmount) != skipAmount)
1082                 throw new CorruptedInputException();
1083 
1084             curPos = seekPos;
1085         }
1086     }
1087 
1088     /**
1089      * Locates the Block that contains the given uncompressed position.
1090      */
locateBlockByPos(BlockInfo info, long pos)1091     private void locateBlockByPos(BlockInfo info, long pos) {
1092         if (pos < 0 || pos >= uncompressedSize)
1093             throw new IndexOutOfBoundsException(
1094                     "Invalid uncompressed position: " + pos);
1095 
1096         // Locate the Stream that contains the target position.
1097         IndexDecoder index;
1098         for (int i = 0; ; ++i) {
1099             index = streams.get(i);
1100             if (index.hasUncompressedOffset(pos))
1101                 break;
1102         }
1103 
1104         // Locate the Block from the Stream that contains the target position.
1105         index.locateBlock(info, pos);
1106 
1107         assert (info.compressedOffset & 3) == 0;
1108         assert info.uncompressedSize > 0;
1109         assert pos >= info.uncompressedOffset;
1110         assert pos < info.uncompressedOffset + info.uncompressedSize;
1111     }
1112 
1113     /**
1114      * Locates the given Block and stores information about it
1115      * to <code>info</code>.
1116      */
1117     private void locateBlockByNumber(BlockInfo info, int blockNumber) {
1118         // Validate.
1119         if (blockNumber < 0 || blockNumber >= blockCount)
1120             throw new IndexOutOfBoundsException(
1121                     "Invalid XZ Block number: " + blockNumber);
1122 
1123         // Skip the search if info already points to the correct Block.
1124         if (info.blockNumber == blockNumber)
1125             return;
1126 
1127         // Search the Stream that contains the given Block and then
1128         // search the Block from that Stream.
1129         for (int i = 0; ; ++i) {
1130             IndexDecoder index = streams.get(i);
1131             if (index.hasRecord(blockNumber)) {
1132                 index.setBlockInfo(info, blockNumber);
1133                 return;
1134             }
1135         }
1136     }
1137 
1138     /**
1139      * Initializes a new BlockInputStream. This is a helper function for
1140      * <code>seek()</code>.
1141      */
1142     private void initBlockDecoder() throws IOException {
1143         try {
1144             // Set it to null first so that GC can collect it if memory
1145             // runs tight when initializing a new BlockInputStream.
1146             if (blockDecoder != null) {
1147                 blockDecoder.close();
1148                 blockDecoder = null;
1149             }
1150 
1151             blockDecoder = new BlockInputStream(
1152                     in, check, verifyCheck, memoryLimit,
1153                     curBlockInfo.unpaddedSize, curBlockInfo.uncompressedSize,
1154                     arrayCache);
1155         } catch (MemoryLimitException e) {
1156             // BlockInputStream doesn't know how much memory we had
1157             // already needed so we need to recreate the exception.
1158             assert memoryLimit >= 0;
1159             throw new MemoryLimitException(
1160                     e.getMemoryNeeded() + indexMemoryUsage,
1161                     memoryLimit + indexMemoryUsage);
1162         } catch (IndexIndicatorException e) {
1163             // It cannot be Index so the file must be corrupt.
1164             throw new CorruptedInputException();
1165         }
1166     }
1167 }
1168