• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied.  See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 package org.apache.commons.compress.compressors.lz77support;
20 
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.util.Arrays;
24 
25 import org.apache.commons.compress.compressors.CompressorInputStream;
26 import org.apache.commons.compress.utils.ByteUtils;
27 import org.apache.commons.compress.utils.CountingInputStream;
28 import org.apache.commons.compress.utils.IOUtils;
29 import org.apache.commons.compress.utils.InputStreamStatistics;
30 
31 /**
32  * Encapsulates code common to LZ77 decompressors.
33  *
34  * <p>Assumes the stream consists of blocks of literal data and
35  * back-references (called copies) in any order. Of course the first
36  * block must be a literal block for the scheme to work - unless the
37  * {@link #prefill prefill} method has been used to provide initial
38  * data that is never returned by {@link #read read} but only used for
39  * back-references.</p>
40  *
41  * <p>Subclasses must override the three-arg {@link #read read} method
42  * as the no-arg version delegates to it and the default
43  * implementation delegates to the no-arg version, leading to infinite
44  * mutual recursion and a {@code StackOverflowError} otherwise.</p>
45  *
46  * <p>The contract for subclasses' {@code read} implementation is:</p>
47  * <ul>
48  *
49  *  <li>keep track of the current state of the stream. Is it inside a
50  *  literal block or a back-reference or in-between blocks?</li>
51  *
52  *  <li>Use {@link #readOneByte} to access the underlying stream
53  *  directly.</li>
54  *
55  *  <li>If a new literal block starts, use {@link #startLiteral} to
56  *  tell this class about it and read the literal data using {@link
57  *  #readLiteral} until it returns {@code 0}. {@link
58  *  #hasMoreDataInBlock} will return {@code false} before the next
59  *  call to {@link #readLiteral} would return {@code 0}.</li>
60  *
61  *  <li>If a new back-reference starts, use {@link #startBackReference} to
62  *  tell this class about it and read the literal data using {@link
63  *  #readBackReference} until it returns {@code 0}. {@link
64  *  #hasMoreDataInBlock} will return {@code false} before the next
65  *  call to {@link #readBackReference} would return {@code 0}.</li>
66  *
67  *  <li>If the end of the stream has been reached, return {@code -1}
68  *  as this class' methods will never do so themselves.</li>
69  *
70  * </ul>
71  *
72  * <p>{@link #readOneByte} and {@link #readLiteral} update the counter
73  * for bytes read.</p>
74  *
75  * @since 1.14
76  */
77 public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream
78     implements InputStreamStatistics {
79 
80     /** Size of the window - must be bigger than the biggest offset expected. */
81     private final int windowSize;
82 
83     /**
84      * Buffer to write decompressed bytes to for back-references, will
85      * be three times windowSize big.
86      *
87      * <p>Three times so we can slide the whole buffer a windowSize to
88      * the left once we've read twice windowSize and still have enough
89      * data inside of it to satisfy back-references.</p>
90      */
91     private final byte[] buf;
92 
93     /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */
94     private int writeIndex;
95 
96     /** Index of the next byte to be read. */
97     private int readIndex;
98 
99     /** The underlying stream to read compressed data from */
100     private final CountingInputStream in;
101 
102     /** Number of bytes still to be read from the current literal or back-reference. */
103     private long bytesRemaining;
104 
105     /** Offset of the current back-reference. */
106     private int backReferenceOffset;
107 
108     /** uncompressed size */
109     private int size = 0;
110 
111     // used in no-arg read method
112     private final byte[] oneByte = new byte[1];
113 
114     /**
115      * Supplier that delegates to {@link #readOneByte}.
116      */
117     protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() {
118         @Override
119         public int getAsByte() throws IOException {
120             return readOneByte();
121         }
122     };
123 
124     /**
125      * Creates a new LZ77 input stream.
126      *
127      * @param is
128      *            An InputStream to read compressed data from
129      * @param windowSize
130      *            Size of the window kept for back-references, must be bigger than the biggest offset expected.
131      *
132      * @throws IOException if reading fails
133      */
AbstractLZ77CompressorInputStream(final InputStream is, int windowSize)134     public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException {
135         this.in = new CountingInputStream(is);
136         this.windowSize = windowSize;
137         buf = new byte[3 * windowSize];
138         writeIndex = readIndex = 0;
139         bytesRemaining = 0;
140     }
141 
142     /** {@inheritDoc} */
143     @Override
read()144     public int read() throws IOException {
145         return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF;
146     }
147 
148     /** {@inheritDoc} */
149     @Override
close()150     public void close() throws IOException {
151         in.close();
152     }
153 
154     /** {@inheritDoc} */
155     @Override
available()156     public int available() {
157         return writeIndex - readIndex;
158     }
159 
160     /**
161      * Get the uncompressed size of the stream
162      *
163      * @return the uncompressed size
164      */
getSize()165     public int getSize() {
166         return size;
167     }
168 
169     /**
170      * Adds some initial data to fill the window with.
171      *
172      * <p>This is used if the stream has been cut into blocks and
173      * back-references of one block may refer to data of the previous
174      * block(s). One such example is the LZ4 frame format using block
175      * dependency.</p>
176      *
177      * @param data the data to fill the window with.
178      * @throws IllegalStateException if the stream has already started to read data
179      */
prefill(byte[] data)180     public void prefill(byte[] data) {
181         if (writeIndex != 0) {
182             throw new IllegalStateException("the stream has already been read from, can't prefill anymore");
183         }
184         // we don't need more data than the big offset could refer to, so cap it
185         int len = Math.min(windowSize, data.length);
186         // we need the last data as we are dealing with *back*-references
187         System.arraycopy(data, data.length - len, buf, 0, len);
188         writeIndex += len;
189         readIndex += len;
190     }
191 
192     /**
193      * @since 1.17
194      */
195     @Override
getCompressedCount()196     public long getCompressedCount() {
197         return in.getBytesRead();
198     }
199 
200     /**
201      * Used by subclasses to signal the next block contains the given
202      * amount of literal data.
203      * @param length the length of the block
204      */
startLiteral(long length)205     protected final void startLiteral(long length) {
206         bytesRemaining = length;
207     }
208 
209     /**
210      * Is there still data remaining inside the current block?
211      * @return true if there is still data remaining inside the current block.
212      */
hasMoreDataInBlock()213     protected final boolean hasMoreDataInBlock() {
214         return bytesRemaining > 0;
215     }
216 
217     /**
218      * Reads data from the current literal block.
219      * @param b buffer to write data to
220      * @param off offset to start writing to
221      * @param len maximum amount of data to read
222      * @return number of bytes read, may be 0. Will never return -1 as
223      * EOF-detection is the responsibility of the subclass
224      * @throws IOException if the underlying stream throws or signals
225      * an EOF before the amount of data promised for the block have
226      * been read
227      */
readLiteral(final byte[] b, final int off, final int len)228     protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException {
229         final int avail = available();
230         if (len > avail) {
231             tryToReadLiteral(len - avail);
232         }
233         return readFromBuffer(b, off, len);
234     }
235 
tryToReadLiteral(int bytesToRead)236     private void tryToReadLiteral(int bytesToRead) throws IOException {
237         // min of "what is still inside the literal", "what does the user want" and "how muc can fit into the buffer"
238         final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining),
239                                              buf.length - writeIndex);
240         final int bytesRead = reallyTryToRead > 0
241             ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead)
242             : 0 /* happens for bytesRemaining == 0 */;
243         count(bytesRead);
244         if (reallyTryToRead != bytesRead) {
245             throw new IOException("Premature end of stream reading literal");
246         }
247         writeIndex += reallyTryToRead;
248         bytesRemaining -= reallyTryToRead;
249     }
250 
readFromBuffer(final byte[] b, final int off, final int len)251     private int readFromBuffer(final byte[] b, final int off, final int len) {
252         final int readable = Math.min(len, available());
253         if (readable > 0) {
254             System.arraycopy(buf, readIndex, b, off, readable);
255             readIndex += readable;
256             if (readIndex > 2 * windowSize) {
257                 slideBuffer();
258             }
259         }
260         size += readable;
261         return readable;
262     }
263 
slideBuffer()264     private void slideBuffer() {
265         System.arraycopy(buf, windowSize, buf, 0, windowSize * 2);
266         writeIndex -= windowSize;
267         readIndex -= windowSize;
268     }
269 
270     /**
271      * Used by subclasses to signal the next block contains a back-reference with the given coordinates.
272      * @param offset the offset of the back-reference
273      * @param length the length of the back-reference
274      */
startBackReference(int offset, long length)275     protected final void startBackReference(int offset, long length) {
276         backReferenceOffset = offset;
277         bytesRemaining = length;
278     }
279 
280     /**
281      * Reads data from the current back-reference.
282      * @param b buffer to write data to
283      * @param off offset to start writing to
284      * @param len maximum amount of data to read
285      * @return number of bytes read, may be 0. Will never return -1 as
286      * EOF-detection is the responsibility of the subclass
287      */
readBackReference(final byte[] b, final int off, final int len)288     protected final int readBackReference(final byte[] b, final int off, final int len) {
289         final int avail = available();
290         if (len > avail) {
291             tryToCopy(len - avail);
292         }
293         return readFromBuffer(b, off, len);
294     }
295 
tryToCopy(int bytesToCopy)296     private void tryToCopy(int bytesToCopy) {
297         // this will fit into the buffer without sliding and not
298         // require more than is available inside the back-reference
299         int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining),
300                             buf.length - writeIndex);
301         if (copy == 0) {
302             // NOP
303         } else if (backReferenceOffset == 1) { // pretty common special case
304             final byte last = buf[writeIndex - 1];
305             Arrays.fill(buf, writeIndex, writeIndex + copy, last);
306             writeIndex += copy;
307         } else if (copy < backReferenceOffset) {
308             System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy);
309             writeIndex += copy;
310         } else {
311             // back-reference overlaps with the bytes created from it
312             // like go back two bytes and then copy six (by copying
313             // the last two bytes three time).
314             final int fullRots = copy / backReferenceOffset;
315             for (int i = 0; i < fullRots; i++) {
316                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset);
317                 writeIndex += backReferenceOffset;
318             }
319 
320             final int pad = copy - (backReferenceOffset * fullRots);
321             if (pad > 0) {
322                 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad);
323                 writeIndex += pad;
324             }
325         }
326         bytesRemaining -= copy;
327     }
328 
329     /**
330      * Reads a single byte from the real input stream and ensures the data is accounted for.
331      *
332      * @return the byte read as value between 0 and 255 or -1 if EOF has been reached.
333      * @throws IOException if the underlying stream throws
334      */
readOneByte()335     protected final int readOneByte() throws IOException {
336         final int b = in.read();
337         if (b != -1) {
338             count(1);
339             return b & 0xFF;
340         }
341         return -1;
342     }
343 }
344