1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 package org.apache.commons.compress.compressors.lz77support; 20 21 import java.io.IOException; 22 import java.io.InputStream; 23 import java.util.Arrays; 24 25 import org.apache.commons.compress.compressors.CompressorInputStream; 26 import org.apache.commons.compress.utils.ByteUtils; 27 import org.apache.commons.compress.utils.CountingInputStream; 28 import org.apache.commons.compress.utils.IOUtils; 29 import org.apache.commons.compress.utils.InputStreamStatistics; 30 31 /** 32 * Encapsulates code common to LZ77 decompressors. 33 * 34 * <p>Assumes the stream consists of blocks of literal data and 35 * back-references (called copies) in any order. Of course the first 36 * block must be a literal block for the scheme to work - unless the 37 * {@link #prefill prefill} method has been used to provide initial 38 * data that is never returned by {@link #read read} but only used for 39 * back-references.</p> 40 * 41 * <p>Subclasses must override the three-arg {@link #read read} method 42 * as the no-arg version delegates to it and the default 43 * implementation delegates to the no-arg version, leading to infinite 44 * mutual recursion and a {@code StackOverflowError} otherwise.</p> 45 * 46 * <p>The contract for subclasses' {@code read} implementation is:</p> 47 * <ul> 48 * 49 * <li>keep track of the current state of the stream. Is it inside a 50 * literal block or a back-reference or in-between blocks?</li> 51 * 52 * <li>Use {@link #readOneByte} to access the underlying stream 53 * directly.</li> 54 * 55 * <li>If a new literal block starts, use {@link #startLiteral} to 56 * tell this class about it and read the literal data using {@link 57 * #readLiteral} until it returns {@code 0}. {@link 58 * #hasMoreDataInBlock} will return {@code false} before the next 59 * call to {@link #readLiteral} would return {@code 0}.</li> 60 * 61 * <li>If a new back-reference starts, use {@link #startBackReference} to 62 * tell this class about it and read the literal data using {@link 63 * #readBackReference} until it returns {@code 0}. {@link 64 * #hasMoreDataInBlock} will return {@code false} before the next 65 * call to {@link #readBackReference} would return {@code 0}.</li> 66 * 67 * <li>If the end of the stream has been reached, return {@code -1} 68 * as this class' methods will never do so themselves.</li> 69 * 70 * </ul> 71 * 72 * <p>{@link #readOneByte} and {@link #readLiteral} update the counter 73 * for bytes read.</p> 74 * 75 * @since 1.14 76 */ 77 public abstract class AbstractLZ77CompressorInputStream extends CompressorInputStream 78 implements InputStreamStatistics { 79 80 /** Size of the window - must be bigger than the biggest offset expected. */ 81 private final int windowSize; 82 83 /** 84 * Buffer to write decompressed bytes to for back-references, will 85 * be three times windowSize big. 86 * 87 * <p>Three times so we can slide the whole buffer a windowSize to 88 * the left once we've read twice windowSize and still have enough 89 * data inside of it to satisfy back-references.</p> 90 */ 91 private final byte[] buf; 92 93 /** One behind the index of the last byte in the buffer that was written, i.e. the next position to write to */ 94 private int writeIndex; 95 96 /** Index of the next byte to be read. */ 97 private int readIndex; 98 99 /** The underlying stream to read compressed data from */ 100 private final CountingInputStream in; 101 102 /** Number of bytes still to be read from the current literal or back-reference. */ 103 private long bytesRemaining; 104 105 /** Offset of the current back-reference. */ 106 private int backReferenceOffset; 107 108 /** uncompressed size */ 109 private int size = 0; 110 111 // used in no-arg read method 112 private final byte[] oneByte = new byte[1]; 113 114 /** 115 * Supplier that delegates to {@link #readOneByte}. 116 */ 117 protected final ByteUtils.ByteSupplier supplier = new ByteUtils.ByteSupplier() { 118 @Override 119 public int getAsByte() throws IOException { 120 return readOneByte(); 121 } 122 }; 123 124 /** 125 * Creates a new LZ77 input stream. 126 * 127 * @param is 128 * An InputStream to read compressed data from 129 * @param windowSize 130 * Size of the window kept for back-references, must be bigger than the biggest offset expected. 131 * 132 * @throws IOException if reading fails 133 */ AbstractLZ77CompressorInputStream(final InputStream is, int windowSize)134 public AbstractLZ77CompressorInputStream(final InputStream is, int windowSize) throws IOException { 135 this.in = new CountingInputStream(is); 136 this.windowSize = windowSize; 137 buf = new byte[3 * windowSize]; 138 writeIndex = readIndex = 0; 139 bytesRemaining = 0; 140 } 141 142 /** {@inheritDoc} */ 143 @Override read()144 public int read() throws IOException { 145 return read(oneByte, 0, 1) == -1 ? -1 : oneByte[0] & 0xFF; 146 } 147 148 /** {@inheritDoc} */ 149 @Override close()150 public void close() throws IOException { 151 in.close(); 152 } 153 154 /** {@inheritDoc} */ 155 @Override available()156 public int available() { 157 return writeIndex - readIndex; 158 } 159 160 /** 161 * Get the uncompressed size of the stream 162 * 163 * @return the uncompressed size 164 */ getSize()165 public int getSize() { 166 return size; 167 } 168 169 /** 170 * Adds some initial data to fill the window with. 171 * 172 * <p>This is used if the stream has been cut into blocks and 173 * back-references of one block may refer to data of the previous 174 * block(s). One such example is the LZ4 frame format using block 175 * dependency.</p> 176 * 177 * @param data the data to fill the window with. 178 * @throws IllegalStateException if the stream has already started to read data 179 */ prefill(byte[] data)180 public void prefill(byte[] data) { 181 if (writeIndex != 0) { 182 throw new IllegalStateException("the stream has already been read from, can't prefill anymore"); 183 } 184 // we don't need more data than the big offset could refer to, so cap it 185 int len = Math.min(windowSize, data.length); 186 // we need the last data as we are dealing with *back*-references 187 System.arraycopy(data, data.length - len, buf, 0, len); 188 writeIndex += len; 189 readIndex += len; 190 } 191 192 /** 193 * @since 1.17 194 */ 195 @Override getCompressedCount()196 public long getCompressedCount() { 197 return in.getBytesRead(); 198 } 199 200 /** 201 * Used by subclasses to signal the next block contains the given 202 * amount of literal data. 203 * @param length the length of the block 204 */ startLiteral(long length)205 protected final void startLiteral(long length) { 206 bytesRemaining = length; 207 } 208 209 /** 210 * Is there still data remaining inside the current block? 211 * @return true if there is still data remaining inside the current block. 212 */ hasMoreDataInBlock()213 protected final boolean hasMoreDataInBlock() { 214 return bytesRemaining > 0; 215 } 216 217 /** 218 * Reads data from the current literal block. 219 * @param b buffer to write data to 220 * @param off offset to start writing to 221 * @param len maximum amount of data to read 222 * @return number of bytes read, may be 0. Will never return -1 as 223 * EOF-detection is the responsibility of the subclass 224 * @throws IOException if the underlying stream throws or signals 225 * an EOF before the amount of data promised for the block have 226 * been read 227 */ readLiteral(final byte[] b, final int off, final int len)228 protected final int readLiteral(final byte[] b, final int off, final int len) throws IOException { 229 final int avail = available(); 230 if (len > avail) { 231 tryToReadLiteral(len - avail); 232 } 233 return readFromBuffer(b, off, len); 234 } 235 tryToReadLiteral(int bytesToRead)236 private void tryToReadLiteral(int bytesToRead) throws IOException { 237 // min of "what is still inside the literal", "what does the user want" and "how muc can fit into the buffer" 238 final int reallyTryToRead = Math.min((int) Math.min(bytesToRead, bytesRemaining), 239 buf.length - writeIndex); 240 final int bytesRead = reallyTryToRead > 0 241 ? IOUtils.readFully(in, buf, writeIndex, reallyTryToRead) 242 : 0 /* happens for bytesRemaining == 0 */; 243 count(bytesRead); 244 if (reallyTryToRead != bytesRead) { 245 throw new IOException("Premature end of stream reading literal"); 246 } 247 writeIndex += reallyTryToRead; 248 bytesRemaining -= reallyTryToRead; 249 } 250 readFromBuffer(final byte[] b, final int off, final int len)251 private int readFromBuffer(final byte[] b, final int off, final int len) { 252 final int readable = Math.min(len, available()); 253 if (readable > 0) { 254 System.arraycopy(buf, readIndex, b, off, readable); 255 readIndex += readable; 256 if (readIndex > 2 * windowSize) { 257 slideBuffer(); 258 } 259 } 260 size += readable; 261 return readable; 262 } 263 slideBuffer()264 private void slideBuffer() { 265 System.arraycopy(buf, windowSize, buf, 0, windowSize * 2); 266 writeIndex -= windowSize; 267 readIndex -= windowSize; 268 } 269 270 /** 271 * Used by subclasses to signal the next block contains a back-reference with the given coordinates. 272 * @param offset the offset of the back-reference 273 * @param length the length of the back-reference 274 */ startBackReference(int offset, long length)275 protected final void startBackReference(int offset, long length) { 276 backReferenceOffset = offset; 277 bytesRemaining = length; 278 } 279 280 /** 281 * Reads data from the current back-reference. 282 * @param b buffer to write data to 283 * @param off offset to start writing to 284 * @param len maximum amount of data to read 285 * @return number of bytes read, may be 0. Will never return -1 as 286 * EOF-detection is the responsibility of the subclass 287 */ readBackReference(final byte[] b, final int off, final int len)288 protected final int readBackReference(final byte[] b, final int off, final int len) { 289 final int avail = available(); 290 if (len > avail) { 291 tryToCopy(len - avail); 292 } 293 return readFromBuffer(b, off, len); 294 } 295 tryToCopy(int bytesToCopy)296 private void tryToCopy(int bytesToCopy) { 297 // this will fit into the buffer without sliding and not 298 // require more than is available inside the back-reference 299 int copy = Math.min((int) Math.min(bytesToCopy, bytesRemaining), 300 buf.length - writeIndex); 301 if (copy == 0) { 302 // NOP 303 } else if (backReferenceOffset == 1) { // pretty common special case 304 final byte last = buf[writeIndex - 1]; 305 Arrays.fill(buf, writeIndex, writeIndex + copy, last); 306 writeIndex += copy; 307 } else if (copy < backReferenceOffset) { 308 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, copy); 309 writeIndex += copy; 310 } else { 311 // back-reference overlaps with the bytes created from it 312 // like go back two bytes and then copy six (by copying 313 // the last two bytes three time). 314 final int fullRots = copy / backReferenceOffset; 315 for (int i = 0; i < fullRots; i++) { 316 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, backReferenceOffset); 317 writeIndex += backReferenceOffset; 318 } 319 320 final int pad = copy - (backReferenceOffset * fullRots); 321 if (pad > 0) { 322 System.arraycopy(buf, writeIndex - backReferenceOffset, buf, writeIndex, pad); 323 writeIndex += pad; 324 } 325 } 326 bytesRemaining -= copy; 327 } 328 329 /** 330 * Reads a single byte from the real input stream and ensures the data is accounted for. 331 * 332 * @return the byte read as value between 0 and 255 or -1 if EOF has been reached. 333 * @throws IOException if the underlying stream throws 334 */ readOneByte()335 protected final int readOneByte() throws IOException { 336 final int b = in.read(); 337 if (b != -1) { 338 count(1); 339 return b & 0xFF; 340 } 341 return -1; 342 } 343 } 344