1 /* 2 * LZEncoder 3 * 4 * Authors: Lasse Collin <lasse.collin@tukaani.org> 5 * Igor Pavlov <http://7-zip.org/> 6 * 7 * This file has been put into the public domain. 8 * You can do whatever you want with this file. 9 */ 10 11 package org.tukaani.xz.lz; 12 13 import java.io.OutputStream; 14 import java.io.IOException; 15 import org.tukaani.xz.ArrayCache; 16 17 public abstract class LZEncoder { 18 public static final int MF_HC4 = 0x04; 19 public static final int MF_BT4 = 0x14; 20 21 /** 22 * Number of bytes to keep available before the current byte 23 * when moving the LZ window. 24 */ 25 private final int keepSizeBefore; 26 27 /** 28 * Number of bytes that must be available, the current byte included, 29 * to make hasEnoughData return true. Flushing and finishing are 30 * naturally exceptions to this since there cannot be any data after 31 * the end of the uncompressed input. 32 */ 33 private final int keepSizeAfter; 34 35 final int matchLenMax; 36 final int niceLen; 37 38 final byte[] buf; 39 final int bufSize; // To avoid buf.length with an array-cached buf. 40 41 int readPos = -1; 42 private int readLimit = -1; 43 private boolean finishing = false; 44 private int writePos = 0; 45 private int pendingSize = 0; 46 normalize(int[] positions, int positionsCount, int normalizationOffset)47 static void normalize(int[] positions, int positionsCount, 48 int normalizationOffset) { 49 for (int i = 0; i < positionsCount; ++i) { 50 if (positions[i] <= normalizationOffset) 51 positions[i] = 0; 52 else 53 positions[i] -= normalizationOffset; 54 } 55 } 56 57 /** 58 * Gets the size of the LZ window buffer that needs to be allocated. 59 */ getBufSize( int dictSize, int extraSizeBefore, int extraSizeAfter, int matchLenMax)60 private static int getBufSize( 61 int dictSize, int extraSizeBefore, int extraSizeAfter, 62 int matchLenMax) { 63 int keepSizeBefore = extraSizeBefore + dictSize; 64 int keepSizeAfter = extraSizeAfter + matchLenMax; 65 int reserveSize = Math.min(dictSize / 2 + (256 << 10), 512 << 20); 66 return keepSizeBefore + keepSizeAfter + reserveSize; 67 } 68 69 /** 70 * Gets approximate memory usage of the LZEncoder base structure and 71 * the match finder as kibibytes. 72 */ getMemoryUsage( int dictSize, int extraSizeBefore, int extraSizeAfter, int matchLenMax, int mf)73 public static int getMemoryUsage( 74 int dictSize, int extraSizeBefore, int extraSizeAfter, 75 int matchLenMax, int mf) { 76 // Buffer size + a little extra 77 int m = getBufSize(dictSize, extraSizeBefore, extraSizeAfter, 78 matchLenMax) / 1024 + 10; 79 80 switch (mf) { 81 case MF_HC4: 82 m += HC4.getMemoryUsage(dictSize); 83 break; 84 85 case MF_BT4: 86 m += BT4.getMemoryUsage(dictSize); 87 break; 88 89 default: 90 throw new IllegalArgumentException(); 91 } 92 93 return m; 94 } 95 96 /** 97 * Creates a new LZEncoder. 98 * <p> 99 * @param dictSize dictionary size 100 * 101 * @param extraSizeBefore 102 * number of bytes to keep available in the 103 * history in addition to dictSize 104 * 105 * @param extraSizeAfter 106 * number of bytes that must be available 107 * after current position + matchLenMax 108 * 109 * @param niceLen if a match of at least <code>niceLen</code> 110 * bytes is found, be happy with it and don't 111 * stop looking for longer matches 112 * 113 * @param matchLenMax don't test for matches longer than 114 * <code>matchLenMax</code> bytes 115 * 116 * @param mf match finder ID 117 * 118 * @param depthLimit match finder search depth limit 119 */ getInstance( int dictSize, int extraSizeBefore, int extraSizeAfter, int niceLen, int matchLenMax, int mf, int depthLimit, ArrayCache arrayCache)120 public static LZEncoder getInstance( 121 int dictSize, int extraSizeBefore, int extraSizeAfter, 122 int niceLen, int matchLenMax, int mf, int depthLimit, 123 ArrayCache arrayCache) { 124 switch (mf) { 125 case MF_HC4: 126 return new HC4(dictSize, extraSizeBefore, extraSizeAfter, 127 niceLen, matchLenMax, depthLimit, arrayCache); 128 129 case MF_BT4: 130 return new BT4(dictSize, extraSizeBefore, extraSizeAfter, 131 niceLen, matchLenMax, depthLimit, arrayCache); 132 } 133 134 throw new IllegalArgumentException(); 135 } 136 137 /** 138 * Creates a new LZEncoder. See <code>getInstance</code>. 139 */ LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter, int niceLen, int matchLenMax, ArrayCache arrayCache)140 LZEncoder(int dictSize, int extraSizeBefore, int extraSizeAfter, 141 int niceLen, int matchLenMax, ArrayCache arrayCache) { 142 bufSize = getBufSize(dictSize, extraSizeBefore, extraSizeAfter, 143 matchLenMax); 144 buf = arrayCache.getByteArray(bufSize, false); 145 146 keepSizeBefore = extraSizeBefore + dictSize; 147 keepSizeAfter = extraSizeAfter + matchLenMax; 148 149 this.matchLenMax = matchLenMax; 150 this.niceLen = niceLen; 151 } 152 putArraysToCache(ArrayCache arrayCache)153 public void putArraysToCache(ArrayCache arrayCache) { 154 arrayCache.putArray(buf); 155 } 156 157 /** 158 * Sets a preset dictionary. If a preset dictionary is wanted, this 159 * function must be called immediately after creating the LZEncoder 160 * before any data has been encoded. 161 */ setPresetDict(int dictSize, byte[] presetDict)162 public void setPresetDict(int dictSize, byte[] presetDict) { 163 assert !isStarted(); 164 assert writePos == 0; 165 166 if (presetDict != null) { 167 // If the preset dictionary buffer is bigger than the dictionary 168 // size, copy only the tail of the preset dictionary. 169 int copySize = Math.min(presetDict.length, dictSize); 170 int offset = presetDict.length - copySize; 171 System.arraycopy(presetDict, offset, buf, 0, copySize); 172 writePos += copySize; 173 skip(copySize); 174 } 175 } 176 177 /** 178 * Moves data from the end of the buffer to the beginning, discarding 179 * old data and making space for new input. 180 */ moveWindow()181 private void moveWindow() { 182 // Align the move to a multiple of 16 bytes. LZMA2 needs this 183 // because it uses the lowest bits from readPos to get the 184 // alignment of the uncompressed data. 185 int moveOffset = (readPos + 1 - keepSizeBefore) & ~15; 186 int moveSize = writePos - moveOffset; 187 System.arraycopy(buf, moveOffset, buf, 0, moveSize); 188 189 readPos -= moveOffset; 190 readLimit -= moveOffset; 191 writePos -= moveOffset; 192 } 193 194 /** 195 * Copies new data into the LZEncoder's buffer. 196 */ fillWindow(byte[] in, int off, int len)197 public int fillWindow(byte[] in, int off, int len) { 198 assert !finishing; 199 200 // Move the sliding window if needed. 201 if (readPos >= bufSize - keepSizeAfter) 202 moveWindow(); 203 204 // Try to fill the dictionary buffer. If it becomes full, 205 // some of the input bytes may be left unused. 206 if (len > bufSize - writePos) 207 len = bufSize - writePos; 208 209 System.arraycopy(in, off, buf, writePos, len); 210 writePos += len; 211 212 // Set the new readLimit but only if there's enough data to allow 213 // encoding of at least one more byte. 214 if (writePos >= keepSizeAfter) 215 readLimit = writePos - keepSizeAfter; 216 217 processPendingBytes(); 218 219 // Tell the caller how much input we actually copied into 220 // the dictionary. 221 return len; 222 } 223 224 /** 225 * Process pending bytes remaining from preset dictionary initialization 226 * or encoder flush operation. 227 */ processPendingBytes()228 private void processPendingBytes() { 229 // After flushing or setting a preset dictionary there will be 230 // pending data that hasn't been ran through the match finder yet. 231 // Run it through the match finder now if there is enough new data 232 // available (readPos < readLimit) that the encoder may encode at 233 // least one more input byte. This way we don't waste any time 234 // looping in the match finder (and marking the same bytes as 235 // pending again) if the application provides very little new data 236 // per write call. 237 if (pendingSize > 0 && readPos < readLimit) { 238 readPos -= pendingSize; 239 int oldPendingSize = pendingSize; 240 pendingSize = 0; 241 skip(oldPendingSize); 242 assert pendingSize < oldPendingSize; 243 } 244 } 245 246 /** 247 * Returns true if at least one byte has already been run through 248 * the match finder. 249 */ 250 public boolean isStarted() { 251 return readPos != -1; 252 } 253 254 /** 255 * Marks that all the input needs to be made available in 256 * the encoded output. 257 */ 258 public void setFlushing() { 259 readLimit = writePos - 1; 260 processPendingBytes(); 261 } 262 263 /** 264 * Marks that there is no more input remaining. The read position 265 * can be advanced until the end of the data. 266 */ 267 public void setFinishing() { 268 readLimit = writePos - 1; 269 finishing = true; 270 processPendingBytes(); 271 } 272 273 /** 274 * Tests if there is enough input available to let the caller encode 275 * at least one more byte. 276 */ 277 public boolean hasEnoughData(int alreadyReadLen) { 278 return readPos - alreadyReadLen < readLimit; 279 } 280 281 public void copyUncompressed(OutputStream out, int backward, int len) 282 throws IOException { 283 out.write(buf, readPos + 1 - backward, len); 284 } 285 286 /** 287 * Get the number of bytes available, including the current byte. 288 * <p> 289 * Note that the result is undefined if <code>getMatches</code> or 290 * <code>skip</code> hasn't been called yet and no preset dictionary 291 * is being used. 292 */ 293 public int getAvail() { 294 assert isStarted(); 295 return writePos - readPos; 296 } 297 298 /** 299 * Gets the lowest four bits of the absolute offset of the current byte. 300 * Bits other than the lowest four are undefined. 301 */ 302 public int getPos() { 303 return readPos; 304 } 305 306 /** 307 * Gets the byte from the given backward offset. 308 * <p> 309 * The current byte is at <code>0</code>, the previous byte 310 * at <code>1</code> etc. To get a byte at zero-based distance, 311 * use <code>getByte(dist + 1)<code>. 312 * <p> 313 * This function is equivalent to <code>getByte(0, backward)</code>. 314 */ 315 public int getByte(int backward) { 316 return buf[readPos - backward] & 0xFF; 317 } 318 319 /** 320 * Gets the byte from the given forward minus backward offset. 321 * The forward offset is added to the current position. This lets 322 * one read bytes ahead of the current byte. 323 */ 324 public int getByte(int forward, int backward) { 325 return buf[readPos + forward - backward] & 0xFF; 326 } 327 328 /** 329 * Get the length of a match at the given distance. 330 * 331 * @param dist zero-based distance of the match to test 332 * @param lenLimit don't test for a match longer than this 333 * 334 * @return length of the match; it is in the range [0, lenLimit] 335 */ 336 public int getMatchLen(int dist, int lenLimit) { 337 int backPos = readPos - dist - 1; 338 int len = 0; 339 340 while (len < lenLimit && buf[readPos + len] == buf[backPos + len]) 341 ++len; 342 343 return len; 344 } 345 346 /** 347 * Get the length of a match at the given distance and forward offset. 348 * 349 * @param forward forward offset 350 * @param dist zero-based distance of the match to test 351 * @param lenLimit don't test for a match longer than this 352 * 353 * @return length of the match; it is in the range [0, lenLimit] 354 */ 355 public int getMatchLen(int forward, int dist, int lenLimit) { 356 int curPos = readPos + forward; 357 int backPos = curPos - dist - 1; 358 int len = 0; 359 360 while (len < lenLimit && buf[curPos + len] == buf[backPos + len]) 361 ++len; 362 363 return len; 364 } 365 366 /** 367 * Verifies that the matches returned by the match finder are valid. 368 * This is meant to be used in an assert statement. This is totally 369 * useless for actual encoding since match finder's results should 370 * naturally always be valid if it isn't broken. 371 * 372 * @param matches return value from <code>getMatches</code> 373 * 374 * @return true if matches are valid, false if match finder is broken 375 */ 376 public boolean verifyMatches(Matches matches) { 377 int lenLimit = Math.min(getAvail(), matchLenMax); 378 379 for (int i = 0; i < matches.count; ++i) 380 if (getMatchLen(matches.dist[i], lenLimit) != matches.len[i]) 381 return false; 382 383 return true; 384 } 385 386 /** 387 * Moves to the next byte, checks if there is enough input available, 388 * and returns the amount of input available. 389 * 390 * @param requiredForFlushing 391 * minimum number of available bytes when 392 * flushing; encoding may be continued with 393 * new input after flushing 394 * @param requiredForFinishing 395 * minimum number of available bytes when 396 * finishing; encoding must not be continued 397 * after finishing or the match finder state 398 * may be corrupt 399 * 400 * @return the number of bytes available or zero if there 401 * is not enough input available 402 */ 403 int movePos(int requiredForFlushing, int requiredForFinishing) { 404 assert requiredForFlushing >= requiredForFinishing; 405 406 ++readPos; 407 int avail = writePos - readPos; 408 409 if (avail < requiredForFlushing) { 410 if (avail < requiredForFinishing || !finishing) { 411 ++pendingSize; 412 avail = 0; 413 } 414 } 415 416 return avail; 417 } 418 419 /** 420 * Runs match finder for the next byte and returns the matches found. 421 */ 422 public abstract Matches getMatches(); 423 424 /** 425 * Skips the given number of bytes in the match finder. 426 */ 427 public abstract void skip(int len); 428 } 429