/*
 * LZMA2Options
 *
 * Author: Lasse Collin <lasse.collin@tukaani.org>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

package org.tukaani.xz;

import java.io.InputStream;
import java.io.IOException;
import org.tukaani.xz.lz.LZEncoder;
import org.tukaani.xz.lzma.LZMAEncoder;

/**
 * LZMA2 compression options.
 * <p>
 * While this allows setting the LZMA2 compression options in detail,
 * often you only need <code>LZMA2Options()</code> or
 * <code>LZMA2Options(int)</code>.
 */
public class LZMA2Options extends FilterOptions {
    /**
     * Minimum valid compression preset level is 0.
     */
    public static final int PRESET_MIN = 0;

    /**
     * Maximum valid compression preset level is 9.
     */
    public static final int PRESET_MAX = 9;

    /**
     * Default compression preset level is 6.
     */
    public static final int PRESET_DEFAULT = 6;

    /**
     * Minimum dictionary size is 4 KiB.
     */
    public static final int DICT_SIZE_MIN = 4096;

    /**
     * Maximum dictionary size for compression is 768 MiB.
     * <p>
     * The decompressor supports bigger dictionaries, up to almost 2 GiB.
     * With HC4 the encoder would support dictionaries bigger than 768 MiB.
     * The 768 MiB limit comes from the current implementation of BT4 where
     * we would otherwise hit the limits of signed ints in array indexing.
     * <p>
     * If you really need bigger dictionary for decompression,
     * use {@link LZMA2InputStream} directly.
     */
    public static final int DICT_SIZE_MAX = 768 << 20;

    /**
     * The default dictionary size is 8 MiB.
     */
    public static final int DICT_SIZE_DEFAULT = 8 << 20;

    /**
     * Maximum value for lc + lp is 4.
     */
    public static final int LC_LP_MAX = 4;

    /**
     * The default number of literal context bits is 3.
     */
    public static final int LC_DEFAULT = 3;

    /**
     * The default number of literal position bits is 0.
     */
    public static final int LP_DEFAULT = 0;

    /**
     * Maximum value for pb is 4.
     */
    public static final int PB_MAX = 4;

    /**
     * The default number of position bits is 2.
     */
    public static final int PB_DEFAULT = 2;

    /**
     * Compression mode: uncompressed.
     * The data is wrapped into a LZMA2 stream without compression.
     */
    public static final int MODE_UNCOMPRESSED = 0;

    /**
     * Compression mode: fast.
     * This is usually combined with a hash chain match finder.
     */
    public static final int MODE_FAST = LZMAEncoder.MODE_FAST;

    /**
     * Compression mode: normal.
     * This is usually combined with a binary tree match finder.
     */
    public static final int MODE_NORMAL = LZMAEncoder.MODE_NORMAL;

    /**
     * Minimum value for <code>niceLen</code> is 8.
     */
    public static final int NICE_LEN_MIN = 8;

    /**
     * Maximum value for <code>niceLen</code> is 273.
     */
    public static final int NICE_LEN_MAX = 273;

    /**
     * Match finder: Hash Chain 2-3-4
     */
    public static final int MF_HC4 = LZEncoder.MF_HC4;

    /**
     * Match finder: Binary tree 2-3-4
     */
    public static final int MF_BT4 = LZEncoder.MF_BT4;

    // Dictionary size for each preset, indexed by preset level 0-9.
    private static final int[] presetToDictSize = {
            1 << 18, 1 << 20, 1 << 21, 1 << 22, 1 << 22,
            1 << 23, 1 << 23, 1 << 24, 1 << 25, 1 << 26 };

    // Match finder depth limit for the fast presets 0-3 only; presets 4-9
    // use depthLimit = 0 and never index this array.
    private static final int[] presetToDepthLimit = { 4, 8, 24, 48 };

    private int dictSize;
    private byte[] presetDict = null;
    private int lc;
    private int lp;
    private int pb;
    private int mode;
    private int niceLen;
    private int mf;
    private int depthLimit;

    /**
     * Creates new LZMA2 options and sets them to the default values.
     * This is equivalent to <code>LZMA2Options(PRESET_DEFAULT)</code>.
     */
    public LZMA2Options() {
        try {
            setPreset(PRESET_DEFAULT);
        } catch (UnsupportedOptionsException e) {
            // PRESET_DEFAULT is within [PRESET_MIN, PRESET_MAX], so
            // setPreset cannot reject it. Keep the cause anyway so that
            // a future regression is diagnosable.
            assert false;
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates new LZMA2 options and sets them to the given preset.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>preset</code> is not supported
     */
    public LZMA2Options(int preset) throws UnsupportedOptionsException {
        setPreset(preset);
    }

    /**
     * Creates new LZMA2 options and sets them to the given custom values.
     * <p>
     * Each value is validated by the corresponding setter. No preset
     * dictionary is set by this constructor.
     *
     * @throws      UnsupportedOptionsException
     *                          unsupported options were specified
     */
    public LZMA2Options(int dictSize, int lc, int lp, int pb, int mode,
                        int niceLen, int mf, int depthLimit)
            throws UnsupportedOptionsException {
        setDictSize(dictSize);
        setLcLp(lc, lp);
        setPb(pb);
        setMode(mode);
        setNiceLen(niceLen);
        setMatchFinder(mf);
        setDepthLimit(depthLimit);
    }

    /**
     * Sets the compression options to the given preset.
     * <p>
     * The presets 0-3 are fast presets with medium compression.
     * The presets 4-6 are fairly slow presets with high compression.
     * The default preset (<code>PRESET_DEFAULT</code>) is 6.
     * <p>
     * The presets 7-9 are like the preset 6 but use bigger dictionaries
     * and have higher compressor and decompressor memory requirements.
     * Unless the uncompressed size of the file exceeds 8&nbsp;MiB,
     * 16&nbsp;MiB, or 32&nbsp;MiB, it is a waste of memory to use the
     * presets 7, 8, or 9, respectively.
     * <p>
     * This overwrites dictSize, lc, lp, pb, mode, mf, niceLen and
     * depthLimit. The preset dictionary is left untouched.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>preset</code> is not supported
     */
    public void setPreset(int preset) throws UnsupportedOptionsException {
        if (preset < PRESET_MIN || preset > PRESET_MAX)
            throw new UnsupportedOptionsException(
                    "Unsupported preset: " + preset);

        lc = LC_DEFAULT;
        lp = LP_DEFAULT;
        pb = PB_DEFAULT;
        dictSize = presetToDictSize[preset];

        if (preset <= 3) {
            mode = MODE_FAST;
            mf = MF_HC4;
            niceLen = preset <= 1 ? 128 : NICE_LEN_MAX;
            depthLimit = presetToDepthLimit[preset];
        } else {
            mode = MODE_NORMAL;
            mf = MF_BT4;
            niceLen = (preset == 4) ? 16 : (preset == 5) ? 32 : 64;
            // Zero lets the match finder pick the depth limit itself.
            depthLimit = 0;
        }
    }

    /**
     * Sets the dictionary size in bytes.
     * <p>
     * The dictionary (or history buffer) holds the most recently seen
     * uncompressed data. Bigger dictionary usually means better compression.
     * However, using a dictionary bigger than the size of the uncompressed
     * data is a waste of memory.
     * <p>
     * Any value in the range [DICT_SIZE_MIN, DICT_SIZE_MAX] is valid,
     * but sizes of 2^n and 2^n&nbsp;+&nbsp;2^(n-1) bytes are somewhat
     * recommended.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>dictSize</code> is not supported
     */
    public void setDictSize(int dictSize) throws UnsupportedOptionsException {
        if (dictSize < DICT_SIZE_MIN)
            throw new UnsupportedOptionsException(
                    "LZMA2 dictionary size must be at least 4 KiB: "
                    + dictSize + " B");

        if (dictSize > DICT_SIZE_MAX)
            throw new UnsupportedOptionsException(
                    "LZMA2 dictionary size must not exceed "
                    + (DICT_SIZE_MAX >> 20) + " MiB: " + dictSize + " B");

        this.dictSize = dictSize;
    }

    /**
     * Gets the dictionary size in bytes.
     */
    public int getDictSize() {
        return dictSize;
    }

    /**
     * Sets a preset dictionary. Use null to disable the use of
     * a preset dictionary. By default there is no preset dictionary.
     * <p>
     * <b>The .xz format doesn't support a preset dictionary for now.
     * Do not set a preset dictionary unless you use raw LZMA2.</b>
     * <p>
     * Preset dictionary can be useful when compressing many similar,
     * relatively small chunks of data independently from each other.
     * A preset dictionary should contain typical strings that occur in
     * the files being compressed. The most probable strings should be
     * near the end of the preset dictionary. The preset dictionary used
     * for compression is also needed for decompression.
     * <p>
     * The array reference is stored as is; no copy is made.
     */
    public void setPresetDict(byte[] presetDict) {
        this.presetDict = presetDict;
    }

    /**
     * Gets the preset dictionary.
     * <p>
     * This returns the stored array reference itself (or null),
     * not a copy.
     */
    public byte[] getPresetDict() {
        return presetDict;
    }

    /**
     * Sets the number of literal context bits and literal position bits.
     * <p>
     * The sum of <code>lc</code> and <code>lp</code> is limited to 4.
     * Trying to exceed it will throw an exception. This function lets
     * you change both at the same time.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>lc</code> and <code>lp</code>
     *                          are invalid
     */
    public void setLcLp(int lc, int lp) throws UnsupportedOptionsException {
        // The individual upper-bound checks keep lc + lp from overflowing
        // before the sum is compared against LC_LP_MAX.
        if (lc < 0 || lp < 0 || lc > LC_LP_MAX || lp > LC_LP_MAX
                || lc + lp > LC_LP_MAX)
            throw new UnsupportedOptionsException(
                    "lc + lp must not exceed " + LC_LP_MAX + ": "
                    + lc + " + " + lp);

        this.lc = lc;
        this.lp = lp;
    }

    /**
     * Sets the number of literal context bits.
     * <p>
     * All bytes that cannot be encoded as matches are encoded as literals.
     * That is, literals are simply 8-bit bytes that are encoded one at
     * a time.
     * <p>
     * The literal coding makes an assumption that the highest <code>lc</code>
     * bits of the previous uncompressed byte correlate with the next byte.
     * For example, in typical English text, an upper-case letter is often
     * followed by a lower-case letter, and a lower-case letter is usually
     * followed by another lower-case letter. In the US-ASCII character set,
     * the highest three bits are 010 for upper-case letters and 011 for
     * lower-case letters. When <code>lc</code> is at least 3, the literal
     * coding can take advantage of this property in the uncompressed data.
     * <p>
     * The default value (3) is usually good. If you want maximum compression,
     * try <code>setLc(4)</code>. Sometimes it helps a little, and sometimes it
     * makes compression worse. If it makes it worse, test for example
     * <code>setLc(2)</code> too.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>lc</code> is invalid, or the sum
     *                          of <code>lc</code> and <code>lp</code>
     *                          exceed LC_LP_MAX
     */
    public void setLc(int lc) throws UnsupportedOptionsException {
        // New lc (parameter) combined with the current lp (field).
        setLcLp(lc, lp);
    }

    /**
     * Sets the number of literal position bits.
     * <p>
     * This affects what kind of alignment in the uncompressed data is
     * assumed when encoding literals. See {@link #setPb(int) setPb} for
     * more information about alignment.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>lp</code> is invalid, or the sum
     *                          of <code>lc</code> and <code>lp</code>
     *                          exceed LC_LP_MAX
     */
    public void setLp(int lp) throws UnsupportedOptionsException {
        // Current lc (field) combined with the new lp (parameter).
        setLcLp(lc, lp);
    }

    /**
     * Gets the number of literal context bits.
     */
    public int getLc() {
        return lc;
    }

    /**
     * Gets the number of literal position bits.
     */
    public int getLp() {
        return lp;
    }

    /**
     * Sets the number of position bits.
     * <p>
     * This affects what kind of alignment in the uncompressed data is
     * assumed in general. The default (2) means four-byte alignment
     * (2^<code>pb</code> = 2^2 = 4), which is often a good choice when
     * there's no better guess.
     * <p>
     * When the alignment is known, setting the number of position bits
     * accordingly may reduce the file size a little. For example with text
     * files having one-byte alignment (US-ASCII, ISO-8859-*, UTF-8), using
     * <code>setPb(0)</code> can improve compression slightly. For UTF-16
     * text, <code>setPb(1)</code> is a good choice. If the alignment is
     * an odd number like 3 bytes, <code>setPb(0)</code> might be the best
     * choice.
     * <p>
     * Even though the assumed alignment can be adjusted with
     * <code>setPb</code> and <code>setLp</code>, LZMA2 still slightly favors
     * 16-byte alignment. It might be worth taking into account when designing
     * file formats that are likely to be often compressed with LZMA2.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>pb</code> is invalid
     */
    public void setPb(int pb) throws UnsupportedOptionsException {
        if (pb < 0 || pb > PB_MAX)
            throw new UnsupportedOptionsException(
                    "pb must not exceed " + PB_MAX + ": " + pb);

        this.pb = pb;
    }

    /**
     * Gets the number of position bits.
     */
    public int getPb() {
        return pb;
    }

    /**
     * Sets the compression mode.
     * <p>
     * This specifies the method to analyze the data produced by
     * a match finder. The default is <code>MODE_FAST</code> for presets
     * 0-3 and <code>MODE_NORMAL</code> for presets 4-9.
     * <p>
     * Usually <code>MODE_FAST</code> is used with Hash Chain match finders
     * and <code>MODE_NORMAL</code> with Binary Tree match finders. This is
     * also what the presets do.
     * <p>
     * The special mode <code>MODE_UNCOMPRESSED</code> doesn't try to
     * compress the data at all (and doesn't use a match finder) and will
     * simply wrap it in uncompressed LZMA2 chunks.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>mode</code> is not supported
     */
    public void setMode(int mode) throws UnsupportedOptionsException {
        // NOTE(review): this range check relies on MODE_UNCOMPRESSED being
        // the smallest and MODE_NORMAL the largest mode value; the latter
        // is defined in LZMAEncoder - confirm if new modes are added.
        if (mode < MODE_UNCOMPRESSED || mode > MODE_NORMAL)
            throw new UnsupportedOptionsException(
                    "Unsupported compression mode: " + mode);

        this.mode = mode;
    }

    /**
     * Gets the compression mode.
     */
    public int getMode() {
        return mode;
    }

    /**
     * Sets the nice length of matches.
     * Once a match of at least <code>niceLen</code> bytes is found,
     * the algorithm stops looking for better matches. Higher values tend
     * to give better compression at the expense of speed. The default
     * depends on the preset.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>niceLen</code> is invalid
     */
    public void setNiceLen(int niceLen) throws UnsupportedOptionsException {
        if (niceLen < NICE_LEN_MIN)
            throw new UnsupportedOptionsException(
                    "Minimum nice length of matches is "
                    + NICE_LEN_MIN + " bytes: " + niceLen);

        if (niceLen > NICE_LEN_MAX)
            throw new UnsupportedOptionsException(
                    "Maximum nice length of matches is " + NICE_LEN_MAX
                    + ": " + niceLen);

        this.niceLen = niceLen;
    }

    /**
     * Gets the nice length of matches.
     */
    public int getNiceLen() {
        return niceLen;
    }

    /**
     * Sets the match finder type.
     * <p>
     * Match finder has a major effect on compression speed, memory usage,
     * and compression ratio. Usually Hash Chain match finders are faster
     * than Binary Tree match finders. The default depends on the preset:
     * 0-3 use <code>MF_HC4</code> and 4-9 use <code>MF_BT4</code>.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>mf</code> is not supported
     */
    public void setMatchFinder(int mf) throws UnsupportedOptionsException {
        if (mf != MF_HC4 && mf != MF_BT4)
            throw new UnsupportedOptionsException(
                    "Unsupported match finder: " + mf);

        this.mf = mf;
    }

    /**
     * Gets the match finder type.
     */
    public int getMatchFinder() {
        return mf;
    }

    /**
     * Sets the match finder search depth limit.
     * <p>
     * The default is a special value of <code>0</code> which indicates that
     * the depth limit should be automatically calculated by the selected
     * match finder from the nice length of matches.
     * <p>
     * Reasonable depth limit for Hash Chain match finders is 4-100 and
     * 16-1000 for Binary Tree match finders. Using very high values can
     * make the compressor extremely slow with some files. Avoid settings
     * higher than 1000 unless you are prepared to interrupt the compression
     * in case it is taking far too long.
     *
     * @throws      UnsupportedOptionsException
     *                          <code>depthLimit</code> is invalid
     */
    public void setDepthLimit(int depthLimit)
            throws UnsupportedOptionsException {
        if (depthLimit < 0)
            throw new UnsupportedOptionsException(
                    "Depth limit cannot be negative: " + depthLimit);

        this.depthLimit = depthLimit;
    }

    /**
     * Gets the match finder search depth limit.
     */
    public int getDepthLimit() {
        return depthLimit;
    }

    /**
     * Gets how much memory the encoder will need with these options.
     * <p>
     * In <code>MODE_UNCOMPRESSED</code> this delegates to
     * <code>UncompressedLZMA2OutputStream.getMemoryUsage()</code>;
     * otherwise it delegates to
     * <code>LZMA2OutputStream.getMemoryUsage(this)</code>.
     */
    public int getEncoderMemoryUsage() {
        return (mode == MODE_UNCOMPRESSED)
               ? UncompressedLZMA2OutputStream.getMemoryUsage()
               : LZMA2OutputStream.getMemoryUsage(this);
    }

    /**
     * Creates an output stream that encodes data with these options and
     * writes the result to <code>out</code>.
     * <p>
     * In <code>MODE_UNCOMPRESSED</code> an
     * <code>UncompressedLZMA2OutputStream</code> is returned; otherwise
     * a <code>LZMA2OutputStream</code> configured from this object is
     * returned.
     */
    public FinishableOutputStream getOutputStream(FinishableOutputStream out) {
        if (mode == MODE_UNCOMPRESSED)
            return new UncompressedLZMA2OutputStream(out);

        return new LZMA2OutputStream(out, this);
    }

    /**
     * Gets how much memory the LZMA2 decoder will need to decompress the data
     * that was encoded with these options and stored in a .xz file.
     * <p>
     * The returned value may be bigger than the value returned by a direct
     * call to {@link LZMA2InputStream#getMemoryUsage(int)} if the dictionary
     * size is not 2^n or 2^n&nbsp;+&nbsp;2^(n-1) bytes. This is because the
     * .xz headers store the dictionary size in such a format and other values
     * are rounded up to the next such value. Such rounding is harmless except
     * it might waste some memory if an unusual dictionary size is used.
     * <p>
     * If you use raw LZMA2 streams and unusual dictionary size, call
     * {@link LZMA2InputStream#getMemoryUsage} directly to get raw decoder
     * memory requirements.
     */
    public int getDecoderMemoryUsage() {
        // Round the dictionary size up to the next 2^n or 2^n + 2^(n-1).
        // The shifts copy the highest set bit into every position below
        // the second-highest bit while leaving the two highest bits as
        // they were, so adding one yields either 2^n or 2^n + 2^(n-1).
        int d = dictSize - 1;
        d |= d >>> 2;
        d |= d >>> 3;
        d |= d >>> 4;
        d |= d >>> 8;
        d |= d >>> 16;
        return LZMA2InputStream.getMemoryUsage(d + 1);
    }

    /**
     * Creates an input stream that decodes LZMA2 data read from
     * <code>in</code> using the dictionary size of these options.
     * <p>
     * NOTE(review): only <code>dictSize</code> is passed to the decoder
     * here; a preset dictionary set with
     * {@link #setPresetDict(byte[]) setPresetDict} is not forwarded.
     * Confirm whether that is intended for raw LZMA2 use.
     *
     * @throws      IOException
     *                          declared by this method; nothing in this
     *                          method itself throws it
     */
    public InputStream getInputStream(InputStream in) throws IOException {
        return new LZMA2InputStream(in, dictSize);
    }

    /**
     * Creates the filter encoder for these options by passing this object
     * to a new <code>LZMA2Encoder</code>.
     */
    FilterEncoder getFilterEncoder() {
        return new LZMA2Encoder(this);
    }

    /**
     * Returns a copy of these options made with <code>super.clone()</code>.
     * <p>
     * The copy is shallow: the preset dictionary array, if any, is shared
     * between the original and the clone.
     */
    public Object clone() {
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            // Not expected when the class hierarchy supports cloning;
            // keep the cause so an unexpected failure is diagnosable.
            assert false;
            throw new RuntimeException(e);
        }
    }
}