tukaani/xz/LZMA2InputStream.java

/*
 * LZMA2InputStream
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <http://7-zip.org/>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

package org.tukaani.xz;

import java.io.InputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.EOFException;
import org.tukaani.xz.lz.LZDecoder;
import org.tukaani.xz.rangecoder.RangeDecoderFromBuffer;
import org.tukaani.xz.lzma.LZMADecoder;

/**
 * Decompresses a raw LZMA2 stream (no XZ headers).
 * <p>
 * The dictionary size is not stored in a raw LZMA2 stream, so the caller
 * has to supply it. Once an <code>IOException</code> has been thrown while
 * decompressing, the same exception is rethrown by later calls to
 * <code>read</code> and <code>available</code> for as long as the stream
 * stays open.
 */
public class LZMA2InputStream extends InputStream {
    /**
     * Smallest valid LZMA2 dictionary size.
     * <p>
     * Very tiny dictionaries would be a performance problem, so
     * the minimum is 4 KiB.
     */
    public static final int DICT_SIZE_MIN = 4096;

    /**
     * Largest dictionary size supported by this implementation.
     * <p>
     * The LZMA2 algorithm allows dictionaries up to one byte less than 4 GiB.
     * This implementation supports only 16 bytes less than 2 GiB for raw
     * LZMA2 streams, and for .xz files the maximum is 1.5 GiB. This
     * limitation is due to Java using signed 32-bit integers for array
     * indexing. The limitation shouldn't matter much in practice since so
     * huge dictionaries are not normally used.
     */
    public static final int DICT_SIZE_MAX = Integer.MAX_VALUE & ~15;

    // Size passed to the range decoder for buffering the compressed data
    // of one LZMA chunk. The chunk header stores the compressed size minus
    // one in 16 bits, so a chunk never holds more than this many bytes.
    private static final int COMPRESSED_SIZE_MAX = 1 << 16;

    private final ArrayCache arrayCache;

    // Set to null by close(); null therefore means "stream closed".
    private DataInputStream in;

    // lz and rc are set to null when their arrays are handed back to
    // arrayCache (at the end marker or on close()). lzma is created only
    // when a chunk header carries new LZMA properties.
    private LZDecoder lz;
    private RangeDecoderFromBuffer rc;
    private LZMADecoder lzma;

    // Uncompressed bytes still to be produced from the current chunk.
    // Zero means that the next chunk header has to be decoded first.
    private int uncompressedSize = 0;

    // True if the current chunk is LZMA-compressed, false if it is
    // stored uncompressed.
    private boolean isLZMAChunk = false;

    // True until a chunk that resets the dictionary has been seen
    // (cleared up front when a non-empty preset dictionary is used).
    private boolean needDictReset = true;

    // True when the next LZMA chunk must carry new LZMA properties.
    private boolean needProps = true;

    // Set when the end marker (control byte 0x00) has been read.
    private boolean endReached = false;

    // IOException thrown by an earlier read, rethrown on later calls.
    private IOException exception = null;

    // One-byte buffer used by read().
    private final byte[] tempBuf = new byte[1];

    /**
     * Gets approximate decompressor memory requirements as kibibytes for
     * the given dictionary size.
     *
     * @param       dictSize    LZMA2 dictionary size as bytes, must be
     *                          in the range [<code>DICT_SIZE_MIN</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @return      approximate memory requirements as kibibytes (KiB)
     *
     * @throws      IllegalArgumentException
     *                          if <code>dictSize</code> is outside
     *                          the supported range
     */
    public static int getMemoryUsage(int dictSize) {
        // The base state is around 30-40 KiB (probabilities etc.),
        // range decoder needs COMPRESSED_SIZE_MAX bytes for buffering,
        // and LZ decoder needs a dictionary buffer.
        return 40 + COMPRESSED_SIZE_MAX / 1024 + getDictSize(dictSize) / 1024;
    }

    /**
     * Validates the dictionary size and rounds it up to a multiple of 16.
     *
     * @param       dictSize    requested dictionary size as bytes
     *
     * @return      <code>dictSize</code> rounded up to a multiple of 16
     *
     * @throws      IllegalArgumentException
     *                          if <code>dictSize</code> is not in the range
     *                          [<code>DICT_SIZE_MIN</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     */
    private static int getDictSize(int dictSize) {
        if (dictSize < DICT_SIZE_MIN || dictSize > DICT_SIZE_MAX)
            throw new IllegalArgumentException(
                    "Unsupported dictionary size " + dictSize);

        // Round dictionary size upward to a multiple of 16. This way LZMA
        // can use LZDecoder.getPos() for calculating LZMA's posMask.
        // Note that this check is needed only for raw LZMA2 streams; it is
        // redundant with .xz.
        //
        // The addition cannot overflow: DICT_SIZE_MAX is already a multiple
        // of 16, so dictSize + 15 is at most Integer.MAX_VALUE.
        return (dictSize + 15) & ~15;
    }

    /**
     * Creates a new input stream that decompresses raw LZMA2 data
     * from <code>in</code>.
     * <p>
     * The caller needs to know the dictionary size used when compressing;
     * the dictionary size isn't stored as part of a raw LZMA2 stream.
     * <p>
     * Specifying a too small dictionary size will prevent decompressing
     * the stream. Specifying a too big dictionary is waste of memory but
     * decompression will work.
     * <p>
     * There is no need to specify a dictionary bigger than
     * the uncompressed size of the data even if a bigger dictionary
     * was used when compressing. If you know the uncompressed size
     * of the data, this might allow saving some memory.
     *
     * @param       in          input stream from which LZMA2-compressed
     *                          data is read
     *
     * @param       dictSize    LZMA2 dictionary size as bytes, must be
     *                          in the range [<code>DICT_SIZE_MIN</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @throws      NullPointerException
     *                          if <code>in</code> is <code>null</code>
     *
     * @throws      IllegalArgumentException
     *                          if <code>dictSize</code> is outside
     *                          the supported range
     */
    public LZMA2InputStream(InputStream in, int dictSize) {
        this(in, dictSize, null);
    }

    /**
     * Creates a new LZMA2 decompressor using a preset dictionary.
     * <p>
     * This is like <code>LZMA2InputStream(InputStream, int)</code> except
     * that the dictionary may be initialized using a preset dictionary.
     * If a preset dictionary was used when compressing the data, the
     * same preset dictionary must be provided when decompressing.
     *
     * @param       in          input stream from which LZMA2-compressed
     *                          data is read
     *
     * @param       dictSize    LZMA2 dictionary size as bytes, must be
     *                          in the range [<code>DICT_SIZE_MIN</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @param       presetDict  preset dictionary or <code>null</code>
     *                          to use no preset dictionary
     *
     * @throws      NullPointerException
     *                          if <code>in</code> is <code>null</code>
     *
     * @throws      IllegalArgumentException
     *                          if <code>dictSize</code> is outside
     *                          the supported range
     */
    public LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict) {
        this(in, dictSize, presetDict, ArrayCache.getDefaultCache());
    }

    /**
     * Creates a new LZMA2 decompressor using a preset dictionary
     * and array cache.
     * <p>
     * This is like <code>LZMA2InputStream(InputStream, int, byte[])</code>
     * except that this also takes the <code>arrayCache</code> argument.
     *
     * @param       in          input stream from which LZMA2-compressed
     *                          data is read
     *
     * @param       dictSize    LZMA2 dictionary size as bytes, must be
     *                          in the range [<code>DICT_SIZE_MIN</code>,
     *                          <code>DICT_SIZE_MAX</code>]
     *
     * @param       presetDict  preset dictionary or <code>null</code>
     *                          to use no preset dictionary
     *
     * @param       arrayCache  cache to be used for allocating large arrays
     *
     * @throws      NullPointerException
     *                          if <code>in</code> is <code>null</code>
     *
     * @throws      IllegalArgumentException
     *                          if <code>dictSize</code> is outside
     *                          the supported range
     *
     * @since 1.7
     */
    LZMA2InputStream(InputStream in, int dictSize, byte[] presetDict,
                     ArrayCache arrayCache) {
        // Check for null explicitly because wrapping the stream in
        // DataInputStream below would not detect it.
        if (in == null)
            throw new NullPointerException();

        // Validate and round the dictionary size before creating anything
        // that is given arrayCache. This way an unsupported size is rejected
        // before any buffers have been requested, so nothing is left behind
        // that would never be handed back with putArraysToCache().
        int roundedDictSize = getDictSize(dictSize);

        this.arrayCache = arrayCache;
        this.in = new DataInputStream(in);
        this.rc = new RangeDecoderFromBuffer(COMPRESSED_SIZE_MAX, arrayCache);
        this.lz = new LZDecoder(roundedDictSize, presetDict, arrayCache);

        // With a non-empty preset dictionary the first chunk is allowed
        // to omit the dictionary reset.
        if (presetDict != null && presetDict.length > 0)
            needDictReset = false;
    }

    /**
     * Decompresses the next byte from this input stream.
     * <p>
     * Reading lots of data with <code>read()</code> from this input stream
     * may be inefficient. Wrap it in <code>java.io.BufferedInputStream</code>
     * if you need to read lots of data one byte at a time.
     *
     * @return      the next decompressed byte, or <code>-1</code>
     *              to indicate the end of the compressed stream
     *
     * @throws      CorruptedInputException
     *                          if the compressed data is detected
     *                          to be invalid
     *
     * @throws      XZIOException if the stream has been closed
     *
     * @throws      EOFException
     *                          compressed input is truncated or corrupt
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public int read() throws IOException {
        // Mask with 0xFF so that the byte is returned as 0-255.
        return read(tempBuf, 0, 1) == -1 ? -1 : (tempBuf[0] & 0xFF);
    }

    /**
     * Decompresses into an array of bytes.
     * <p>
     * If <code>len</code> is zero, no bytes are read and <code>0</code>
     * is returned. Otherwise this will block until <code>len</code>
     * bytes have been decompressed, the end of the LZMA2 stream is reached,
     * or an exception is thrown.
     * <p>
     * An <code>IOException</code> thrown while decompressing is remembered
     * and thrown again by later calls.
     *
     * @param       buf         target buffer for uncompressed data
     * @param       off         start offset in <code>buf</code>
     * @param       len         maximum number of uncompressed bytes to read
     *
     * @return      number of bytes read, or <code>-1</code> to indicate
     *              the end of the compressed stream
     *
     * @throws      IndexOutOfBoundsException
     *                          if <code>off</code> or <code>len</code> is
     *                          negative or <code>off + len</code> exceeds
     *                          <code>buf.length</code>
     *
     * @throws      CorruptedInputException
     *                          if the compressed data is detected
     *                          to be invalid
     *
     * @throws      XZIOException if the stream has been closed
     *
     * @throws      EOFException
     *                          compressed input is truncated or corrupt
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    public int read(byte[] buf, int off, int len) throws IOException {
        // "off + len < 0" catches integer overflow of the sum.
        if (off < 0 || len < 0 || off + len < 0 || off + len > buf.length)
            throw new IndexOutOfBoundsException();

        if (len == 0)
            return 0;

        if (in == null)
            throw new XZIOException("Stream closed");

        if (exception != null)
            throw exception;

        if (endReached)
            return -1;

        try {
            int size = 0;

            while (len > 0) {
                // The previous chunk (if any) has been fully consumed,
                // so the next chunk header or the end marker follows.
                if (uncompressedSize == 0) {
                    decodeChunkHeader();
                    if (endReached)
                        return size == 0 ? -1 : size;
                }

                // Never produce more than what remains in this chunk
                // or what the caller asked for.
                int copySizeMax = Math.min(uncompressedSize, len);

                if (!isLZMAChunk) {
                    lz.copyUncompressed(in, copySizeMax);
                } else {
                    lz.setLimit(copySizeMax);
                    lzma.decode();
                }

                int copiedSize = lz.flush(buf, off);
                off += copiedSize;
                len -= copiedSize;
                size += copiedSize;
                uncompressedSize -= copiedSize;

                // At the end of a chunk the range decoder must report
                // that it is finished and the LZ decoder must have
                // nothing pending; otherwise the input is corrupt.
                if (uncompressedSize == 0)
                    if (!rc.isFinished() || lz.hasPending())
                        throw new CorruptedInputException();
            }

            return size;

        } catch (IOException e) {
            // Remember the failure so that later calls rethrow it.
            exception = e;
            throw e;
        }
    }

    /**
     * Reads the next LZMA2 chunk header from <code>in</code> and updates
     * the decoder state accordingly.
     * <p>
     * The control byte is handled as follows:
     * <ul>
     * <li><code>0x00</code>: end marker; <code>endReached</code> is set
     *     and the arrays are returned to the array cache</li>
     * <li><code>0x01</code> or <code>&gt;= 0xE0</code>: the dictionary
     *     is reset and new LZMA properties become mandatory</li>
     * <li><code>0x01</code> and <code>0x02</code>: uncompressed chunk</li>
     * <li><code>0x03</code> to <code>0x7F</code>: invalid</li>
     * <li><code>&gt;= 0x80</code>: LZMA chunk; <code>&gt;= 0xC0</code>
     *     carries new properties, <code>0xA0</code> to <code>0xBF</code>
     *     resets the LZMA decoder state</li>
     * </ul>
     *
     * @throws      CorruptedInputException
     *                          if the control byte is invalid or a required
     *                          dictionary reset or properties are missing
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    private void decodeChunkHeader() throws IOException {
        int control = in.readUnsignedByte();

        if (control == 0x00) {
            endReached = true;
            putArraysToCache();
            return;
        }

        if (control >= 0xE0 || control == 0x01) {
            needProps = true;
            needDictReset = false;
            lz.reset();
        } else if (needDictReset) {
            // A chunk without a dictionary reset arrived while
            // a reset was still required.
            throw new CorruptedInputException();
        }

        if (control >= 0x80) {
            isLZMAChunk = true;

            // The low five bits of the control byte are the high bits of
            // (uncompressed size - 1); the next two bytes are its low bits.
            uncompressedSize = (control & 0x1F) << 16;
            uncompressedSize += in.readUnsignedShort() + 1;

            // The compressed size is stored as (size - 1) too.
            int compressedSize = in.readUnsignedShort() + 1;

            if (control >= 0xC0) {
                needProps = false;
                decodeProps();

            } else if (needProps) {
                throw new CorruptedInputException();

            } else if (control >= 0xA0) {
                lzma.reset();
            }

            rc.prepareInputBuffer(in, compressedSize);

        } else if (control > 0x02) {
            throw new CorruptedInputException();

        } else {
            // Uncompressed chunk; the size is stored as (size - 1).
            isLZMAChunk = false;
            uncompressedSize = in.readUnsignedShort() + 1;
        }
    }

    /**
     * Reads the LZMA properties byte from <code>in</code> and creates
     * a new <code>LZMADecoder</code> using the decoded values.
     * <p>
     * The byte is decoded as <code>(pb * 5 + lp) * 9 + lc</code>.
     *
     * @throws      CorruptedInputException
     *                          if the byte is greater than
     *                          <code>(4 * 5 + 4) * 9 + 8</code> or
     *                          if <code>lc + lp</code> is greater than 4
     *
     * @throws      IOException may be thrown by <code>in</code>
     */
    private void decodeProps() throws IOException {
        int props = in.readUnsignedByte();

        // Largest encodable value: pb = 4, lp = 4, lc = 8.
        if (props > (4 * 5 + 4) * 9 + 8)
            throw new CorruptedInputException();

        int pb = props / (9 * 5);
        props -= pb * 9 * 5;
        int lp = props / 9;
        int lc = props - lp * 9;

        if (lc + lp > 4)
            throw new CorruptedInputException();

        lzma = new LZMADecoder(lz, rc, lc, lp, pb);
    }

    /**
     * Returns the number of uncompressed bytes that can be read
     * without blocking. The value is returned with an assumption
     * that the compressed input data will be valid. If the compressed
     * data is corrupt, <code>CorruptedInputException</code> may get
     * thrown before the number of bytes claimed to be available have
     * been read from this input stream.
     * <p>
     * In LZMA2InputStream, the return value will be non-zero when the
     * decompressor is in the middle of an LZMA2 chunk. The return value
     * will then be the number of uncompressed bytes remaining from that
     * chunk. The return value can also be non-zero in the middle of
     * an uncompressed chunk, but then the return value depends also on
     * the <code>available()</code> method of the underlying InputStream.
     *
     * @return      the number of uncompressed bytes that can be read
     *              without blocking
     *
     * @throws      XZIOException if the stream has been closed
     *
     * @throws      IOException if an earlier read failed with this
     *                          exception, or if thrown by
     *                          <code>in.available()</code>
     */
    public int available() throws IOException {
        if (in == null)
            throw new XZIOException("Stream closed");

        if (exception != null)
            throw exception;

        return isLZMAChunk ? uncompressedSize
                           : Math.min(uncompressedSize, in.available());
    }

    /**
     * Hands the arrays of the LZ decoder and the range decoder back to
     * the array cache and drops the references to both decoders.
     * Does nothing if this has already been done.
     */
    private void putArraysToCache() {
        // lz and rc are always cleared together, so checking lz is enough.
        if (lz != null) {
            lz.putArraysToCache(arrayCache);
            lz = null;

            rc.putArraysToCache(arrayCache);
            rc = null;
        }
    }

    /**
     * Closes the stream and calls <code>in.close()</code>.
     * If the stream was already closed, this does nothing.
     *
     * @throws  IOException if thrown by <code>in.close()</code>
     */
    public void close() throws IOException {
        if (in != null) {
            putArraysToCache();

            try {
                in.close();
            } finally {
                // Mark the stream as closed even if in.close() throws.
                in = null;
            }
        }
    }
}