• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Licensed to the Apache Software Foundation (ASF) under one
3  * or more contributor license agreements.  See the NOTICE file
4  * distributed with this work for additional information
5  * regarding copyright ownership.  The ASF licenses this file
6  * to you under the Apache License, Version 2.0 (the
7  * "License"); you may not use this file except in compliance
8  * with the License.  You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing,
13  * software distributed under the License is distributed on an
14  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15  * KIND, either express or implied.  See the License for the
16  * specific language governing permissions and limitations
17  * under the License.
18  */
19 package org.apache.commons.compress.compressors.lz77support;
20 
21 import java.io.IOException;
22 import java.util.Arrays;
23 
24 /**
25  * Helper class for compression algorithms that use the ideas of LZ77.
26  *
27  * <p>Most LZ77 derived algorithms split input data into blocks of
28  * uncompressed data (called literal blocks) and back-references
29  * (pairs of offsets and lengths) that state "add <code>length</code>
30  * bytes that are the same as those already written starting
31  * <code>offset</code> bytes before the current position. The details
32  * of how those blocks and back-references are encoded are quite
33  * different between the algorithms and some algorithms perform
34  * additional steps (Huffman encoding in the case of DEFLATE for
35  * example).</p>
36  *
37  * <p>This class attempts to extract the core logic - finding
38  * back-references - so it can be re-used. It follows the algorithm
39  * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't
40  * implement the "lazy match" optimization. The three-byte hash
41  * function used in this class is the same as the one used by zlib and
42  * InfoZIP's ZIP implementation of DEFLATE. The whole class is
43  * strongly inspired by InfoZIP's implementation.</p>
44  *
45  * <p>LZ77 is used vaguely here (as well as many other places that
46  * talk about it :-), LZSS would likely be closer to the truth but
47  * LZ77 has become the synonym for a whole family of algorithms.</p>
48  *
49  * <p>The API consists of a compressor that is fed <code>byte</code>s
50  * and emits {@link Block}s to a registered callback where the blocks
51  * represent either {@link LiteralBlock literal blocks}, {@link
52  * BackReference back-references} or {@link EOD end of data
53  * markers}. In order to ensure the callback receives all information,
54  * the {@code #finish} method must be used once all data has been fed
55  * into the compressor.</p>
56  *
57  * <p>Several parameters influence the outcome of the "compression":</p>
58  * <dl>
59  *
60  *  <dt><code>windowSize</code></dt> <dd>the size of the sliding
61  *  window, must be a power of two - this determines the maximum
62  *  offset a back-reference can take. The compressor maintains a
63  *  buffer of twice of <code>windowSize</code> - real world values are
64  *  in the area of 32k.</dd>
65  *
66  *  <dt><code>minBackReferenceLength</code></dt>
67  *  <dd>Minimal length of a back-reference found. A true minimum of 3 is
68  *  hard-coded inside of this implemention but bigger lengths can be
69  *  configured.</dd>
70  *
71  *  <dt><code>maxBackReferenceLength</code></dt>
72  *  <dd>Maximal length of a back-reference found.</dd>
73  *
74  *  <dt><code>maxOffset</code></dt>
75  *  <dd>Maximal offset of a back-reference.</dd>
76  *
77  *  <dt><code>maxLiteralLength</code></dt>
78  *  <dd>Maximal length of a literal block.</dd>
79  * </dl>
80  *
81  * @see "https://tools.ietf.org/html/rfc1951#section-4"
82  * @since 1.14
83  * @NotThreadSafe
84  */
85 public class LZ77Compressor {
86 
87     /**
88      * Base class representing blocks the compressor may emit.
89      *
90      * <p>This class is not supposed to be subclassed by classes
91      * outside of Commons Compress so it is considered internal and
92      * changed that would break subclasses may get introduced with
93      * future releases.</p>
94      */
95     public static abstract class Block {
96         /** Enumeration of the block types the compressor may emit. */
97         public enum BlockType {
98             LITERAL, BACK_REFERENCE, EOD
99         }
getType()100         public abstract BlockType getType();
101     }
102 
103     /**
104      * Represents a literal block of data.
105      *
106      * <p>For performance reasons this encapsulates the real data, not
107      * a copy of it. Don't modify the data and process it inside of
108      * {@link Callback#accept} immediately as it will get overwritten
109      * sooner or later.</p>
110      */
111     public static final class LiteralBlock extends Block {
112         private final byte[] data;
113         private final int offset, length;
LiteralBlock(byte[] data, int offset, int length)114         public LiteralBlock(byte[] data, int offset, int length) {
115             this.data = data;
116             this.offset = offset;
117             this.length = length;
118         }
119         /**
120          * The literal data.
121          *
122          * <p>This returns a life view of the actual data in order to
123          * avoid copying, modify the array at your own risk.</p>
124          * @return the data
125          */
getData()126         public byte[] getData() {
127             return data;
128         }
129         /**
130          * Offset into data where the literal block starts.
131          * @return the offset
132          */
getOffset()133         public int getOffset() {
134             return offset;
135         }
136         /**
137          * Length of literal block.
138          * @return the length
139          */
getLength()140         public int getLength() {
141             return length;
142         }
143         @Override
getType()144         public BlockType getType() {
145             return BlockType.LITERAL;
146         }
147         @Override
toString()148         public String toString() {
149             return "LiteralBlock starting at " + offset + " with length " + length;
150         }
151     }
152 
153     /**
154      * Represents a back-reference.
155      */
156     public static final class BackReference extends Block {
157         private final int offset, length;
BackReference(int offset, int length)158         public BackReference(int offset, int length) {
159             this.offset = offset;
160             this.length = length;
161         }
162         /**
163          * Provides the offset of the back-reference.
164          * @return the offset
165          */
getOffset()166         public int getOffset() {
167             return offset;
168         }
169         /**
170          * Provides the length of the back-reference.
171          * @return the length
172          */
getLength()173         public int getLength() {
174             return length;
175         }
176         @Override
getType()177         public BlockType getType() {
178             return BlockType.BACK_REFERENCE;
179         }
180         @Override
toString()181         public String toString() {
182             return "BackReference with offset " + offset + " and length " + length;
183         }
184     }
185 
186     /** A simple "we are done" marker. */
187     public static final class EOD extends Block {
188         @Override
getType()189         public BlockType getType() {
190             return BlockType.EOD;
191         }
192     }
193 
194     private static final Block THE_EOD = new EOD();
195 
196     /**
197      * Callback invoked while the compressor processes data.
198      *
199      * <p>The callback is invoked on the same thread that receives the
200      * bytes to compress and may be invoked multiple times during the
201      * execution of {@link #compress} or {@link #finish}.</p>
202      */
203     public interface Callback {
204         /**
205          * Consumes a block.
206          * @param b the block to consume
207          * @throws IOException in case of an error
208          */
accept(Block b)209         void accept(Block b) throws IOException;
210     }
211 
212     static final int NUMBER_OF_BYTES_IN_HASH = 3;
213     private static final int NO_MATCH = -1;
214 
215     private final Parameters params;
216     private final Callback callback;
217 
218     // the sliding window, twice as big as "windowSize" parameter
219     private final byte[] window;
220     // the head of hash-chain - indexed by hash-code, points to the
221     // location inside of window of the latest sequence of bytes with
222     // the given hash.
223     private final int[] head;
224     // for each window-location points to the latest earlier location
225     // with the same hash. Only stores values for the latest
226     // "windowSize" elements, the index is "window location modulo
227     // windowSize".
228     private final int[] prev;
229 
230     // bit mask used when indexing into prev
231     private final int wMask;
232 
233     private boolean initialized = false;
234     // the position inside of window that shall be encoded right now
235     private int currentPosition;
236     // the number of bytes available to compress including the one at
237     // currentPosition
238     private int lookahead = 0;
239     // the hash of the three bytes stating at the current position
240     private int insertHash = 0;
241     // the position inside of the window where the current literal
242     // block starts (in case we are inside of a literal block).
243     private int blockStart = 0;
244     // position of the current match
245     private int matchStart = NO_MATCH;
246     // number of missed insertString calls for the up to three last
247     // bytes of the last match that can only be performed once more
248     // data has been read
249     private int missedInserts = 0;
250 
251     /**
252      * Initializes a compressor with parameters and a callback.
253      * @param params the parameters
254      * @param callback the callback
255      * @throws NullPointerException if either parameter is <code>null</code>
256      */
LZ77Compressor(Parameters params, Callback callback)257     public LZ77Compressor(Parameters params, Callback callback) {
258         if (params == null) {
259             throw new NullPointerException("params must not be null");
260         }
261         if (callback == null) {
262             throw new NullPointerException("callback must not be null");
263         }
264         this.params = params;
265         this.callback = callback;
266 
267         final int wSize = params.getWindowSize();
268         window = new byte[wSize * 2];
269         wMask = wSize - 1;
270         head = new int[HASH_SIZE];
271         Arrays.fill(head, NO_MATCH);
272         prev = new int[wSize];
273     }
274 
275     /**
276      * Feeds bytes into the compressor which in turn may emit zero or
277      * more blocks to the callback during the execution of this
278      * method.
279      * @param data the data to compress - must not be null
280      * @throws IOException if the callback throws an exception
281      */
compress(byte[] data)282     public void compress(byte[] data) throws IOException {
283         compress(data, 0, data.length);
284     }
285 
286     /**
287      * Feeds bytes into the compressor which in turn may emit zero or
288      * more blocks to the callback during the execution of this
289      * method.
290      * @param data the data to compress - must not be null
291      * @param off the start offset of the data
292      * @param len the number of bytes to compress
293      * @throws IOException if the callback throws an exception
294      */
compress(byte[] data, int off, int len)295     public void compress(byte[] data, int off, int len) throws IOException {
296         final int wSize = params.getWindowSize();
297         while (len > wSize) { // chop into windowSize sized chunks
298             doCompress(data, off, wSize);
299             off += wSize;
300             len -= wSize;
301         }
302         if (len > 0) {
303             doCompress(data, off, len);
304         }
305     }
306 
307     /**
308      * Tells the compressor to process all remaining data and signal
309      * end of data to the callback.
310      *
311      * <p>The compressor will in turn emit at least one block ({@link
312      * EOD}) but potentially multiple blocks to the callback during
313      * the execution of this method.</p>
314      * @throws IOException if the callback throws an exception
315      */
finish()316     public void finish() throws IOException {
317         if (blockStart != currentPosition || lookahead > 0) {
318             currentPosition += lookahead;
319             flushLiteralBlock();
320         }
321         callback.accept(THE_EOD);
322     }
323 
324     /**
325      * Adds some initial data to fill the window with.
326      *
327      * <p>This is used if the stream has been cut into blocks and
328      * back-references of one block may refer to data of the previous
329      * block(s). One such example is the LZ4 frame format using block
330      * dependency.</p>
331      *
332      * @param data the data to fill the window with.
333      * @throws IllegalStateException if the compressor has already started to accept data
334      */
prefill(byte[] data)335     public void prefill(byte[] data) {
336         if (currentPosition != 0 || lookahead != 0) {
337             throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore");
338         }
339 
340         // don't need more than windowSize for back-references
341         final int len = Math.min(params.getWindowSize(), data.length);
342         System.arraycopy(data, data.length - len, window, 0, len);
343 
344         if (len >= NUMBER_OF_BYTES_IN_HASH) {
345             initialize();
346             final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1;
347             for (int i = 0; i < stop; i++) {
348                 insertString(i);
349             }
350             missedInserts = NUMBER_OF_BYTES_IN_HASH - 1;
351         } else { // not enough data to hash anything
352             missedInserts = len;
353         }
354         blockStart = currentPosition = len;
355     }
356 
357     // we use a 15 bit hashcode as calculated in updateHash
358     private static final int HASH_SIZE = 1 << 15;
359     private static final int HASH_MASK = HASH_SIZE - 1;
360     private static final int H_SHIFT = 5;
361 
362     /**
363      * Assumes we are calculating the hash for three consecutive bytes
364      * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC
365      * the new hash for BCD is nextHash(H, D).
366      *
367      * <p>The hash is shifted by five bits on each update so all
368      * effects of A have been swapped after the third update.</p>
369      */
nextHash(int oldHash, byte nextByte)370     private int nextHash(int oldHash, byte nextByte) {
371         final int nextVal = nextByte & 0xFF;
372         return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK;
373     }
374 
375     // performs the actual algorithm with the pre-condition len <= windowSize
doCompress(byte[] data, int off, int len)376     private void doCompress(byte[] data, int off, int len) throws IOException {
377         int spaceLeft = window.length - currentPosition - lookahead;
378         if (len > spaceLeft) {
379             slide();
380         }
381         System.arraycopy(data, off, window, currentPosition + lookahead, len);
382         lookahead += len;
383         if (!initialized && lookahead >= params.getMinBackReferenceLength()) {
384             initialize();
385         }
386         if (initialized) {
387             compress();
388         }
389     }
390 
slide()391     private void slide() throws IOException {
392         final int wSize = params.getWindowSize();
393         if (blockStart != currentPosition && blockStart < wSize) {
394             flushLiteralBlock();
395             blockStart = currentPosition;
396         }
397         System.arraycopy(window, wSize, window, 0, wSize);
398         currentPosition -= wSize;
399         matchStart -= wSize;
400         blockStart -= wSize;
401         for (int i = 0; i < HASH_SIZE; i++) {
402             int h = head[i];
403             head[i] = h >= wSize ? h - wSize : NO_MATCH;
404         }
405         for (int i = 0; i < wSize; i++) {
406             int p = prev[i];
407             prev[i] = p >= wSize ? p - wSize : NO_MATCH;
408         }
409     }
410 
initialize()411     private void initialize() {
412         for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) {
413             insertHash = nextHash(insertHash, window[i]);
414         }
415         initialized = true;
416     }
417 
compress()418     private void compress() throws IOException {
419         final int minMatch = params.getMinBackReferenceLength();
420         final boolean lazy = params.getLazyMatching();
421         final int lazyThreshold = params.getLazyMatchingThreshold();
422 
423         while (lookahead >= minMatch) {
424             catchUpMissedInserts();
425             int matchLength = 0;
426             int hashHead = insertString(currentPosition);
427             if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) {
428                 // sets matchStart as a side effect
429                 matchLength = longestMatch(hashHead);
430 
431                 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) {
432                     // try to find a longer match using the next position
433                     matchLength = longestMatchForNextPosition(matchLength);
434                 }
435             }
436             if (matchLength >= minMatch) {
437                 if (blockStart != currentPosition) {
438                     // emit preceeding literal block
439                     flushLiteralBlock();
440                     blockStart = NO_MATCH;
441                 }
442                 flushBackReference(matchLength);
443                 insertStringsInMatch(matchLength);
444                 lookahead -= matchLength;
445                 currentPosition += matchLength;
446                 blockStart = currentPosition;
447             } else {
448                 // no match, append to current or start a new literal
449                 lookahead--;
450                 currentPosition++;
451                 if (currentPosition - blockStart >= params.getMaxLiteralLength()) {
452                     flushLiteralBlock();
453                     blockStart = currentPosition;
454                 }
455             }
456         }
457     }
458 
459     /**
460      * Inserts the current three byte sequence into the dictionary and
461      * returns the previous head of the hash-chain.
462      *
463      * <p>Updates <code>insertHash</code> and <code>prev</code> as a
464      * side effect.</p>
465      */
insertString(int pos)466     private int insertString(int pos) {
467         insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]);
468         int hashHead = head[insertHash];
469         prev[pos & wMask] = hashHead;
470         head[insertHash] = pos;
471         return hashHead;
472     }
473 
longestMatchForNextPosition(final int prevMatchLength)474     private int longestMatchForNextPosition(final int prevMatchLength) {
475         // save a bunch of values to restore them if the next match isn't better than the current one
476         final int prevMatchStart = matchStart;
477         final int prevInsertHash = insertHash;
478 
479         lookahead--;
480         currentPosition++;
481         int hashHead = insertString(currentPosition);
482         final int prevHashHead = prev[currentPosition & wMask];
483         int matchLength = longestMatch(hashHead);
484 
485         if (matchLength <= prevMatchLength) {
486             // use the first match, as the next one isn't any better
487             matchLength = prevMatchLength;
488             matchStart = prevMatchStart;
489 
490             // restore modified values
491             head[insertHash] = prevHashHead;
492             insertHash = prevInsertHash;
493             currentPosition--;
494             lookahead++;
495         }
496         return matchLength;
497     }
498 
insertStringsInMatch(int matchLength)499     private void insertStringsInMatch(int matchLength) {
500         // inserts strings contained in current match
501         // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts
502         final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH);
503         // currentPosition has been inserted already
504         for (int i = 1; i <= stop; i++) {
505             insertString(currentPosition + i);
506         }
507         missedInserts = matchLength - stop - 1;
508     }
509 
catchUpMissedInserts()510     private void catchUpMissedInserts() {
511         while (missedInserts > 0) {
512             insertString(currentPosition - missedInserts--);
513         }
514     }
515 
flushBackReference(int matchLength)516     private void flushBackReference(int matchLength) throws IOException {
517         callback.accept(new BackReference(currentPosition - matchStart, matchLength));
518     }
519 
flushLiteralBlock()520     private void flushLiteralBlock() throws IOException {
521         callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart));
522     }
523 
524     /**
525      * Searches the hash chain for real matches and returns the length
526      * of the longest match (0 if none were found) that isn't too far
527      * away (WRT maxOffset).
528      *
529      * <p>Sets matchStart to the index of the start position of the
530      * longest match as a side effect.</p>
531      */
longestMatch(int matchHead)532     private int longestMatch(int matchHead) {
533         final int minLength = params.getMinBackReferenceLength();
534         int longestMatchLength = minLength - 1;
535         final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead);
536         final int minIndex = Math.max(0, currentPosition - params.getMaxOffset());
537         final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength());
538         final int maxCandidates = params.getMaxCandidates();
539         for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) {
540             int currentLength = 0;
541             for (int i = 0; i < maxPossibleLength; i++) {
542                 if (window[matchHead + i] != window[currentPosition + i]) {
543                     break;
544                 }
545                 currentLength++;
546             }
547             if (currentLength > longestMatchLength) {
548                 longestMatchLength = currentLength;
549                 matchStart = matchHead;
550                 if (currentLength >= niceBackReferenceLength) {
551                     // no need to search any further
552                     break;
553                 }
554             }
555             matchHead = prev[matchHead & wMask];
556         }
557         return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress()
558     }
559 }
560