1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2016, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 9 /** 10 * A compression engine implementing the Standard Compression Scheme 11 * for Unicode (SCSU) as outlined in <A 12 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical 13 * Report #6</A>. 14 * 15 * <P>The SCSU works by using dynamically positioned <EM>windows</EM> 16 * consisting of 128 consecutive characters in Unicode. During compression, 17 * characters within a window are encoded in the compressed stream as the bytes 18 * <TT>0x7F - 0xFF</TT>. The SCSU provides transparency for the characters 19 * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the 20 * storage size of traditional character sets, for example 1 byte per 21 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK 22 * ideographs.</P> 23 * 24 * <P><STRONG>USAGE</STRONG></P> 25 * 26 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a 27 * straightforward manner to compress simple strings:</P> 28 * 29 * <PRE> 30 * String s = ... ; // get string from somewhere 31 * byte [] compressed = UnicodeCompressor.compress(s); 32 * </PRE> 33 * 34 * <P>The static methods have a fairly large memory footprint. 35 * For finer-grained control over memory usage, 36 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing 37 * iterative compression:</P> 38 * 39 * <PRE> 40 * // Compress an array "chars" of length "len" using a buffer of 512 bytes 41 * // to the OutputStream "out" 42 * 43 * UnicodeCompressor myCompressor = new UnicodeCompressor(); 44 * final static int BUFSIZE = 512; 45 * byte [] byteBuffer = new byte [ BUFSIZE ]; 46 * int bytesWritten = 0; 47 * int [] unicharsRead = new int [1]; 48 * int totalCharsCompressed = 0; 49 * int totalBytesWritten = 0; 50 * 51 * do { 52 * // do the compression 53 * bytesWritten = myCompressor.compress(chars, totalCharsCompressed, 54 * len, unicharsRead, 55 * byteBuffer, 0, BUFSIZE); 56 * 57 * // do something with the current set of bytes 58 * out.write(byteBuffer, 0, bytesWritten); 59 * 60 * // update the no. of characters compressed 61 * totalCharsCompressed += unicharsRead[0]; 62 * 63 * // update the no. of bytes written 64 * totalBytesWritten += bytesWritten; 65 * 66 * } while(totalCharsCompressed < len); 67 * 68 * myCompressor.reset(); // reuse compressor 69 * </PRE> 70 * 71 * @see UnicodeDecompressor 72 * 73 * @author Stephen F. Booth 74 * @stable ICU 2.4 75 */ 76 77 /* 78 * 79 * COMPRESSION STRATEGY 80 * 81 * Single Byte Mode 82 * 83 * There are three relevant cases. 84 * If the character is in the current window or is Latin-1 (U+0000, 85 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed 86 * directly in the stream as a single byte. 87 * 88 * 1. Current character is in defined, inactive window. 89 * 2. Current character is in undefined window. 90 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF). 91 * 92 * 1. Current character is in defined, inactive window 93 * A. Look ahead two characters 94 * B. If both following characters in same window as current character, 95 * switch to defined window 96 * C. If only next character is in same window as current character, 97 * quote defined window 98 * D. If neither of following characters is in same window as current, 99 * quote defined window 100 * 101 * 2. Current character is in undefined window 102 * A. Look ahead two characters 103 * B. If both following characters in same window as current character, 104 * define new window 105 * C. If only next character in same window as current character, 106 * switch to Unicode mode 107 * NOTE: This costs us one extra byte. However, 108 * since we have a limited number of windows to work with, it is 109 * assumed the cost will pay off later in savings from a window with 110 * more characters in it. 111 * D. If neither of following characters in same window as current, 112 * switch to Unicode mode. Alternative to above: just quote 113 * Unicode (same byte cost) 114 * 115 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF) 116 * A. Look ahead one character 117 * B. If next character in non-compressible region, switch to 118 * Unicode mode 119 * C. If next character not in non-compressible region, quote Unicode 120 * 121 * 122 * The following chart illustrates the bytes required for encoding characters 123 * in each possible way 124 * 125 * 126 * SINGLE BYTE MODE 127 * Characters in a row with same index 128 * tag encountered 1 2 3 4 129 * --------------------------------------------------------------- 130 * none (in current window) 1 2 3 4 131 * 132 * quote Unicode 3 6 9 12 133 * 134 * window not switch to Unicode 3 5 7 9 byte 135 * defined define window 3 4 5 6 cost 136 * 137 * window switch to window 2 3 4 5 138 * defined quote window 2 4 6 8 139 * 140 * Unicode Mode 141 * 142 * There are two relevant cases. 143 * If the character is in the non-compressible region 144 * (U+3400 - U+DFFF), the character is simply written to the 145 * stream as a pair of bytes. 146 * 147 * 1. Current character is in defined, inactive window. 148 * 2. Current character is in undefined window. 149 * 150 * 1.Current character is in defined, inactive window 151 * A. Look ahead one character 152 * B. If next character has same index as current character, 153 * switch to defined window (and switch to single-byte mode) 154 * C. If not, just put bytes in stream 155 * 156 * 157 * 2. Current character is in undefined window 158 * A. Look ahead two characters 159 * B. If both in same window as current character, define window 160 * (and switch to single-byte mode) 161 * C. If only next character in same window, just put bytes in stream 162 * NOTE: This costs us one extra byte. However, 163 * since we have a limited number of windows to work with, it is 164 * assumed the cost will pay off later in savings from a window with 165 * more characters in it. 166 * D. If neither in same window, put bytes in stream 167 * 168 * 169 * The following chart illustrates the bytes required for encoding characters 170 * in each possible way 171 * 172 * 173 * UNICODE MODE 174 * Characters in a row with same index 175 * tag encountered 1 2 3 4 176 * --------------------------------------------------------------- 177 * none 2 4 6 8 178 * 179 * quote Unicode 3 6 9 12 180 * 181 * window not define window 3 4 5 6 byte 182 * defined cost 183 * window switch to window 2 3 4 5 184 * defined 185 */ 186 public final class UnicodeCompressor implements SCSU 187 { 188 //========================== 189 // Class variables 190 //========================== 191 192 /** For quick identification of a byte as a single-byte mode tag */ 193 private static boolean [] sSingleTagTable = { 194 // table generated by CompressionTableGenerator 195 false, true, true, true, true, true, true, true, true, false, 196 false, true, true, false, true, true, true, true, true, true, 197 true, true, true, true, true, true, true, true, true, true, 198 true, true, false, false, false, false, false, false,false, 199 false, false, false, false, false, false, false, false, false, 200 false, false, false, false, false, false, false, false, false, 201 false, false, false, false, false, false, false, false, false, 202 false, false, false, false, false, false, false, false, false, 203 false, false, false, false, false, false, false, false, false, 204 false, false, false, false, false, false, false, false, false, 205 false, false, false, false, false, false, false, false, false, 206 false, false, false, false, false, false, false, false, false, 207 false, false, false, false, false, false, false, false, false, 208 false, false, false, false, false, false, false, false, false, 209 false, false, false, false, false, false, false, false, false, 210 false, false, false, false, false, false, false, false, false, 211 false, false, false, false, false, false, false, false, false, 212 false, false, false, false, false, false, false, false, false, 213 false, false, false, false, false, false, false, false, false, 214 false, false, false, false, false, false, false, false, false, 215 false, false, false, false, false, false, false, false, false, 216 false, false, false, false, false, false, false, false, false, 217 false, false, false, false, false, false, false, false, false, 218 false, false, false, false, false, false, false, false, false, 219 false, false, false, false, false, false, false, false, false, 220 false, false, false, false, false, false, false, false, false, 221 false, false, false, false, false, false, false, false, false, 222 false, false, false, false, false, false, false, false, false, 223 false 224 }; 225 226 /** For quick identification of a byte as a unicode mode tag */ 227 private static boolean [] sUnicodeTagTable = { 228 // table generated by CompressionTableGenerator 229 false, false, false, false, false, false, false, false, false, 230 false, false, false, false, false, false, false, false, false, 231 false, false, false, false, false, false, false, false, false, 232 false, false, false, false, false, false, false, false, false, 233 false, false, false, false, false, false, false, false, false, 234 false, false, false, false, false, false, false, false, false, 235 false, false, false, false, false, false, false, false, false, 236 false, false, false, false, false, false, false, false, false, 237 false, false, false, false, false, false, false, false, false, 238 false, false, false, false, false, false, false, false, false, 239 false, false, false, false, false, false, false, false, false, 240 false, false, false, false, false, false, false, false, false, 241 false, false, false, false, false, false, false, false, false, 242 false, false, false, false, false, false, false, false, false, 243 false, false, false, false, false, false, false, false, false, 244 false, false, false, false, false, false, false, false, false, 245 false, false, false, false, false, false, false, false, false, 246 false, false, false, false, false, false, false, false, false, 247 false, false, false, false, false, false, false, false, false, 248 false, false, false, false, false, false, false, false, false, 249 false, false, false, false, false, false, false, false, false, 250 false, false, false, false, false, false, false, false, false, 251 false, false, false, false, false, false, false, false, false, 252 false, false, false, false, false, false, false, false, false, 253 false, false, false, false, false, false, false, false, true, 254 true, true, true, true, true, true, true, true, true, true, 255 true, true, true, true, true, true, true, true, false, false, 256 false, false, false, false, false, false, false, false, false, 257 false, false 258 }; 259 260 //========================== 261 // Instance variables 262 //========================== 263 264 /** Alias to current dynamic window */ 265 private int fCurrentWindow = 0; 266 267 /** Dynamic compression window offsets */ 268 private int [] fOffsets = new int [ NUMWINDOWS ]; 269 270 /** Current compression mode */ 271 private int fMode = SINGLEBYTEMODE; 272 273 /** Keeps count of times character indices are encountered */ 274 private int [] fIndexCount = new int [ MAXINDEX + 1 ]; 275 276 /** The time stamps indicate when a window was last defined */ 277 private int [] fTimeStamps = new int [ NUMWINDOWS ]; 278 279 /** The current time stamp */ 280 private int fTimeStamp = 0; 281 282 283 /** 284 * Create a UnicodeCompressor. 285 * Sets all windows to their default values. 286 * @see #reset 287 * @stable ICU 2.4 288 */ UnicodeCompressor()289 public UnicodeCompressor() 290 { 291 reset(); // initialize to defaults 292 } 293 294 /** 295 * Compress a string into a byte array. 296 * @param buffer The string to compress. 297 * @return A byte array containing the compressed characters. 298 * @see #compress(char [], int, int) 299 * @stable ICU 2.4 300 */ compress(String buffer)301 public static byte [] compress(String buffer) 302 { 303 return compress(buffer.toCharArray(), 0, buffer.length()); 304 } 305 306 /** 307 * Compress a Unicode character array into a byte array. 308 * @param buffer The character buffer to compress. 309 * @param start The start of the character run to compress. 310 * @param limit The limit of the character run to compress. 311 * @return A byte array containing the compressed characters. 312 * @see #compress(String) 313 * @stable ICU 2.4 314 */ compress(char [] buffer, int start, int limit)315 public static byte [] compress(char [] buffer, 316 int start, 317 int limit) 318 { 319 UnicodeCompressor comp = new UnicodeCompressor(); 320 321 // use a buffer that we know will never overflow 322 // in the worst case, each character will take 3 bytes 323 // to encode: UQU, hibyte, lobyte. In this case, the 324 // compressed data will look like: SCU, UQU, hibyte, lobyte, ... 325 // buffer must be at least 4 bytes in size 326 int len = Math.max(4, 3 * (limit - start) + 1); 327 byte [] temp = new byte [len]; 328 329 int byteCount = comp.compress(buffer, start, limit, null, 330 temp, 0, len); 331 332 byte [] result = new byte [byteCount]; 333 System.arraycopy(temp, 0, result, 0, byteCount); 334 return result; 335 } 336 337 /** 338 * Compress a Unicode character array into a byte array. 339 * 340 * This function will only consume input that can be completely 341 * output. 342 * 343 * @param charBuffer The character buffer to compress. 344 * @param charBufferStart The start of the character run to compress. 345 * @param charBufferLimit The limit of the character run to compress. 346 * @param charsRead A one-element array. If not null, on return 347 * the number of characters read from charBuffer. 348 * @param byteBuffer A buffer to receive the compressed data. This 349 * buffer must be at minimum four bytes in size. 350 * @param byteBufferStart The starting offset to which to write 351 * compressed data. 352 * @param byteBufferLimit The limiting offset for writing compressed data. 353 * @return The number of bytes written to byteBuffer. 354 * @stable ICU 2.4 355 */ compress(char [] charBuffer, int charBufferStart, int charBufferLimit, int [] charsRead, byte [] byteBuffer, int byteBufferStart, int byteBufferLimit)356 public int compress(char [] charBuffer, 357 int charBufferStart, 358 int charBufferLimit, 359 int [] charsRead, 360 byte [] byteBuffer, 361 int byteBufferStart, 362 int byteBufferLimit) 363 { 364 // the current position in the target byte buffer 365 int bytePos = byteBufferStart; 366 367 // the current position in the source unicode character buffer 368 int ucPos = charBufferStart; 369 370 // the current unicode character from the source buffer 371 int curUC = INVALIDCHAR; 372 373 // the index for the current character 374 int curIndex = -1; 375 376 // look ahead 377 int nextUC = INVALIDCHAR; 378 int forwardUC = INVALIDCHAR; 379 380 // temporary for window searching 381 int whichWindow = 0; 382 383 // high and low bytes of the current unicode character 384 int hiByte = 0; 385 int loByte = 0; 386 387 388 // byteBuffer must be at least 4 bytes in size 389 if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4) 390 throw new IllegalArgumentException("byteBuffer.length < 4"); 391 392 mainLoop: 393 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { 394 switch(fMode) { 395 // main single byte mode compression loop 396 case SINGLEBYTEMODE: 397 singleByteModeLoop: 398 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { 399 // get current char 400 curUC = charBuffer[ucPos++]; 401 402 // get next char 403 if(ucPos < charBufferLimit) 404 nextUC = charBuffer[ucPos]; 405 else 406 nextUC = INVALIDCHAR; 407 408 // chars less than 0x0080 (excluding tags) go straight 409 // in stream 410 if(curUC < 0x0080) { 411 loByte = curUC & 0xFF; 412 413 // we need to check and make sure we don't 414 // accidentally write a single byte mode tag to 415 // the stream unless it's quoted 416 if(sSingleTagTable[loByte]) { 417 // make sure there is enough room to 418 // write both bytes if not, rewind the 419 // source stream and break out 420 if( (bytePos + 1) >= byteBufferLimit) 421 { --ucPos; break mainLoop; } 422 423 // since we know the byte is less than 0x80, SQUOTE0 424 // will use static window 0, or ASCII 425 byteBuffer[bytePos++] = (byte) SQUOTE0; 426 } 427 428 byteBuffer[bytePos++] = (byte) loByte; 429 } 430 431 // if the char belongs to current window, convert it 432 // to a byte by adding the generic compression offset 433 // and subtracting the window's offset 434 else if(inDynamicWindow(curUC, fCurrentWindow) ) { 435 byteBuffer[bytePos++] = (byte) 436 (curUC - fOffsets[ fCurrentWindow ] 437 + COMPRESSIONOFFSET); 438 } 439 440 // if char is not in compressible range, either switch to or 441 // quote from unicode 442 else if( ! isCompressible(curUC) ) { 443 // only check next character if it is valid 444 if(nextUC != INVALIDCHAR && isCompressible(nextUC)) { 445 // make sure there is enough room to 446 // write all three bytes if not, 447 // rewind the source stream and break 448 // out 449 if( (bytePos + 2) >= byteBufferLimit) 450 { --ucPos; break mainLoop; } 451 452 byteBuffer[bytePos++] = (byte) SQUOTEU; 453 byteBuffer[bytePos++] = (byte) (curUC >>> 8); 454 byteBuffer[bytePos++] = (byte) (curUC & 0xFF); 455 } 456 else { 457 // make sure there is enough room to 458 // write all four bytes if not, rewind 459 // the source stream and break out 460 if((bytePos + 3) >= byteBufferLimit) 461 { --ucPos; break mainLoop; } 462 463 byteBuffer[bytePos++] = (byte) SCHANGEU; 464 465 hiByte = curUC >>> 8; 466 loByte = curUC & 0xFF; 467 468 if(sUnicodeTagTable[hiByte]) 469 // add quote Unicode tag 470 byteBuffer[bytePos++] = (byte) UQUOTEU; 471 472 byteBuffer[bytePos++] = (byte) hiByte; 473 byteBuffer[bytePos++] = (byte) loByte; 474 475 fMode = UNICODEMODE; 476 break singleByteModeLoop; 477 } 478 } 479 480 // if the char is in a currently defined dynamic 481 // window, figure out which one, and either switch to 482 // it or quote from it 483 else if((whichWindow = findDynamicWindow(curUC)) 484 != INVALIDWINDOW ) { 485 // look ahead 486 if( (ucPos + 1) < charBufferLimit ) 487 forwardUC = charBuffer[ucPos + 1]; 488 else 489 forwardUC = INVALIDCHAR; 490 491 // all three chars in same window, switch to that 492 // window inDynamicWindow will return false for 493 // INVALIDCHAR 494 if(inDynamicWindow(nextUC, whichWindow) 495 && inDynamicWindow(forwardUC, whichWindow)) { 496 // make sure there is enough room to 497 // write both bytes if not, rewind the 498 // source stream and break out 499 if( (bytePos + 1) >= byteBufferLimit) 500 { --ucPos; break mainLoop; } 501 502 byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow); 503 byteBuffer[bytePos++] = (byte) 504 (curUC - fOffsets[whichWindow] 505 + COMPRESSIONOFFSET); 506 fTimeStamps [ whichWindow ] = ++fTimeStamp; 507 fCurrentWindow = whichWindow; 508 } 509 510 // either only next char or neither in same 511 // window, so quote 512 else { 513 // make sure there is enough room to 514 // write both bytes if not, rewind the 515 // source stream and break out 516 if((bytePos + 1) >= byteBufferLimit) 517 { --ucPos; break mainLoop; } 518 519 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow); 520 byteBuffer[bytePos++] = (byte) 521 (curUC - fOffsets[whichWindow] 522 + COMPRESSIONOFFSET); 523 } 524 } 525 526 // if a static window is defined, and the following 527 // character is not in that static window, quote from 528 // the static window Note: to quote from a static 529 // window, don't add 0x80 530 else if((whichWindow = findStaticWindow(curUC)) 531 != INVALIDWINDOW 532 && ! inStaticWindow(nextUC, whichWindow) ) { 533 // make sure there is enough room to write both 534 // bytes if not, rewind the source stream and 535 // break out 536 if((bytePos + 1) >= byteBufferLimit) 537 { --ucPos; break mainLoop; } 538 539 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow); 540 byteBuffer[bytePos++] = (byte) 541 (curUC - sOffsets[whichWindow]); 542 } 543 544 // if a window is not defined, decide if we want to 545 // define a new one or switch to unicode mode 546 else { 547 // determine index for current char (char is compressible) 548 curIndex = makeIndex(curUC); 549 fIndexCount[curIndex]++; 550 551 // look ahead 552 if((ucPos + 1) < charBufferLimit) 553 forwardUC = charBuffer[ucPos + 1]; 554 else 555 forwardUC = INVALIDCHAR; 556 557 // if we have encountered this index at least once 558 // before, define a new window 559 // OR 560 // three chars in a row with same index, define a 561 // new window (makeIndex will return RESERVEDINDEX 562 // for INVALIDCHAR) 563 if((fIndexCount[curIndex] > 1) || 564 (curIndex == makeIndex(nextUC) 565 && curIndex == makeIndex(forwardUC))) { 566 // make sure there is enough room to write all 567 // three bytes if not, rewind the source 568 // stream and break out 569 if( (bytePos + 2) >= byteBufferLimit) 570 { --ucPos; break mainLoop; } 571 572 // get least recently defined window 573 whichWindow = getLRDefinedWindow(); 574 575 byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow); 576 byteBuffer[bytePos++] = (byte) curIndex; 577 byteBuffer[bytePos++] = (byte) 578 (curUC - sOffsetTable[curIndex] 579 + COMPRESSIONOFFSET); 580 581 fOffsets[whichWindow] = sOffsetTable[curIndex]; 582 fCurrentWindow = whichWindow; 583 fTimeStamps [whichWindow] = ++fTimeStamp; 584 } 585 586 // only two chars in a row with same index, so 587 // switch to unicode mode (makeIndex will return 588 // RESERVEDINDEX for INVALIDCHAR) 589 // OR 590 // three chars have different indices, so switch 591 // to unicode mode 592 else { 593 // make sure there is enough room to write all 594 // four bytes if not, rewind the source stream 595 // and break out 596 if((bytePos + 3) >= byteBufferLimit) 597 { --ucPos; break mainLoop; } 598 599 byteBuffer[bytePos++] = (byte) SCHANGEU; 600 601 hiByte = curUC >>> 8; 602 loByte = curUC & 0xFF; 603 604 if(sUnicodeTagTable[hiByte]) 605 // add quote Unicode tag 606 byteBuffer[bytePos++] = (byte) UQUOTEU; 607 608 byteBuffer[bytePos++] = (byte) hiByte; 609 byteBuffer[bytePos++] = (byte) loByte; 610 611 fMode = UNICODEMODE; 612 break singleByteModeLoop; 613 } 614 } 615 } 616 break; 617 618 case UNICODEMODE: 619 // main unicode mode compression loop 620 unicodeModeLoop: 621 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { 622 // get current char 623 curUC = charBuffer[ucPos++]; 624 625 // get next char 626 if( ucPos < charBufferLimit ) 627 nextUC = charBuffer[ucPos]; 628 else 629 nextUC = INVALIDCHAR; 630 631 // if we have two uncompressible chars in a row, 632 // put the current char's bytes in the stream 633 if( ! isCompressible(curUC) 634 || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) { 635 // make sure there is enough room to write all three bytes 636 // if not, rewind the source stream and break out 637 if( (bytePos + 2) >= byteBufferLimit) 638 { --ucPos; break mainLoop; } 639 640 hiByte = curUC >>> 8; 641 loByte = curUC & 0xFF; 642 643 if(sUnicodeTagTable[ hiByte ]) 644 // add quote Unicode tag 645 byteBuffer[bytePos++] = (byte) UQUOTEU; 646 647 byteBuffer[bytePos++] = (byte) hiByte; 648 byteBuffer[bytePos++] = (byte) loByte; 649 } 650 651 // bytes less than 0x80 can go straight in the stream, 652 // but in single-byte mode 653 else if(curUC < 0x0080) { 654 loByte = curUC & 0xFF; 655 656 // if two chars in a row below 0x80 and the 657 // current char is not a single-byte mode tag, 658 // switch to single-byte mode 659 if(nextUC != INVALIDCHAR 660 && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) { 661 // make sure there is enough room to 662 // write both bytes if not, rewind the 663 // source stream and break out 664 if( (bytePos + 1) >= byteBufferLimit) 665 { --ucPos; break mainLoop; } 666 667 // use the last-active window 668 whichWindow = fCurrentWindow; 669 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow); 670 byteBuffer[bytePos++] = (byte) loByte; 671 672 //fCurrentWindow = 0; 673 fTimeStamps [whichWindow] = ++fTimeStamp; 674 fMode = SINGLEBYTEMODE; 675 break unicodeModeLoop; 676 } 677 678 // otherwise, just write the bytes to the stream 679 // (this will cover the case of only 1 char less than 0x80 680 // and single-byte mode tags) 681 else { 682 // make sure there is enough room to 683 // write both bytes if not, rewind the 684 // source stream and break out 685 if((bytePos + 1) >= byteBufferLimit) 686 { --ucPos; break mainLoop; } 687 688 // since the character is less than 0x80, the 689 // high byte is always 0x00 - no need for 690 // (curUC >>> 8) 691 byteBuffer[bytePos++] = (byte) 0x00; 692 byteBuffer[bytePos++] = (byte) loByte; 693 } 694 } 695 696 // figure out if the current char is in a defined window 697 else if((whichWindow = findDynamicWindow(curUC)) 698 != INVALIDWINDOW ) { 699 // if two chars in a row in the same window, 700 // switch to that window and go to single-byte mode 701 // inDynamicWindow will return false for INVALIDCHAR 702 if(inDynamicWindow(nextUC, whichWindow)) { 703 // make sure there is enough room to 704 // write both bytes if not, rewind the 705 // source stream and break out 706 if((bytePos + 1) >= byteBufferLimit) 707 { --ucPos; break mainLoop; } 708 709 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow); 710 byteBuffer[bytePos++] = (byte) 711 (curUC - fOffsets[whichWindow] 712 + COMPRESSIONOFFSET); 713 714 fTimeStamps [ whichWindow ] = ++fTimeStamp; 715 fCurrentWindow = whichWindow; 716 fMode = SINGLEBYTEMODE; 717 break unicodeModeLoop; 718 } 719 720 // otherwise, just quote the unicode for the char 721 else { 722 // make sure there is enough room to 723 // write all three bytes if not, 724 // rewind the source stream and break 725 // out 726 if((bytePos + 2) >= byteBufferLimit) 727 { --ucPos; break mainLoop; } 728 729 hiByte = curUC >>> 8; 730 loByte = curUC & 0xFF; 731 732 if(sUnicodeTagTable[ hiByte ]) 733 // add quote Unicode tag 734 byteBuffer[bytePos++] = (byte) UQUOTEU; 735 736 byteBuffer[bytePos++] = (byte) hiByte; 737 byteBuffer[bytePos++] = (byte) loByte; 738 } 739 } 740 741 // char is not in a defined window 742 else { 743 // determine index for current char (char is compressible) 744 curIndex = makeIndex(curUC); 745 fIndexCount[curIndex]++; 746 747 // look ahead 748 if( (ucPos + 1) < charBufferLimit ) 749 forwardUC = charBuffer[ucPos + 1]; 750 else 751 forwardUC = INVALIDCHAR; 752 753 // if we have encountered this index at least once 754 // before, define a new window for it that hasn't 755 // previously been redefined 756 // OR 757 // if three chars in a row with the same index, 758 // define a new window (makeIndex will return 759 // RESERVEDINDEX for INVALIDCHAR) 760 if((fIndexCount[curIndex] > 1) || 761 (curIndex == makeIndex(nextUC) 762 && curIndex == makeIndex(forwardUC))) { 763 // make sure there is enough room to 764 // write all three bytes if not, 765 // rewind the source stream and break 766 // out 767 if((bytePos + 2) >= byteBufferLimit) 768 { --ucPos; break mainLoop; } 769 770 // get least recently defined window 771 whichWindow = getLRDefinedWindow(); 772 773 byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow); 774 byteBuffer[bytePos++] = (byte) curIndex; 775 byteBuffer[bytePos++] = (byte) 776 (curUC - sOffsetTable[curIndex] 777 + COMPRESSIONOFFSET); 778 779 fOffsets[whichWindow] = sOffsetTable[curIndex]; 780 fCurrentWindow = whichWindow; 781 fTimeStamps [whichWindow] = ++fTimeStamp; 782 fMode = SINGLEBYTEMODE; 783 break unicodeModeLoop; 784 } 785 786 // otherwise just quote the unicode, and save our 787 // windows for longer runs 788 else { 789 // make sure there is enough room to 790 // write all three bytes if not, 791 // rewind the source stream and break 792 // out 793 if((bytePos + 2) >= byteBufferLimit) 794 { --ucPos; break mainLoop; } 795 796 hiByte = curUC >>> 8; 797 loByte = curUC & 0xFF; 798 799 if(sUnicodeTagTable[ hiByte ]) 800 // add quote Unicode tag 801 byteBuffer[bytePos++] = (byte) UQUOTEU; 802 803 byteBuffer[bytePos++] = (byte) hiByte; 804 byteBuffer[bytePos++] = (byte) loByte; 805 } 806 } 807 } 808 } // end switch 809 } 810 811 // fill in output parameter 812 if(charsRead != null) 813 charsRead [0] = (ucPos - charBufferStart); 814 815 // return # of bytes written 816 return (bytePos - byteBufferStart); 817 } 818 819 /** 820 * Reset the compressor to its initial state. 821 * @stable ICU 2.4 822 */ reset()823 public void reset() 824 { 825 int i; 826 827 // reset dynamic windows 828 fOffsets[0] = 0x0080; // Latin-1 829 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A 830 fOffsets[2] = 0x0400; // Cyrillic 831 fOffsets[3] = 0x0600; // Arabic 832 fOffsets[4] = 0x0900; // Devanagari 833 fOffsets[5] = 0x3040; // Hiragana 834 fOffsets[6] = 0x30A0; // Katakana 835 fOffsets[7] = 0xFF00; // Fullwidth ASCII 836 837 838 // reset time stamps 839 for(i = 0; i < NUMWINDOWS; i++) { 840 fTimeStamps[i] = 0; 841 } 842 843 // reset count of seen indices 844 for(i = 0; i <= MAXINDEX; i++ ) { 845 fIndexCount[i] = 0; 846 } 847 848 fTimeStamp = 0; // Reset current time stamp 849 fCurrentWindow = 0; // Make current window Latin-1 850 fMode = SINGLEBYTEMODE; // Always start in single-byte mode 851 } 852 853 //========================== 854 // Determine the index for a character 855 //========================== 856 857 /** 858 * Create the index value for a character. 859 * For more information on this function, refer to table X-3 860 * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>. 861 * @param c The character in question. 862 * @return An index for c 863 */ makeIndex(int c)864 private static int makeIndex(int c) 865 { 866 // check the predefined indices 867 if(c >= 0x00C0 && c < 0x0140) 868 return LATININDEX; 869 else if(c >= 0x0250 && c < 0x02D0) 870 return IPAEXTENSIONINDEX; 871 else if(c >= 0x0370 && c < 0x03F0) 872 return GREEKINDEX; 873 else if(c >= 0x0530 && c < 0x0590) 874 return ARMENIANINDEX; 875 else if(c >= 0x3040 && c < 0x30A0) 876 return HIRAGANAINDEX; 877 else if(c >= 0x30A0 && c < 0x3120) 878 return KATAKANAINDEX; 879 else if(c >= 0xFF60 && c < 0xFF9F) 880 return HALFWIDTHKATAKANAINDEX; 881 882 // calculate index 883 else if(c >= 0x0080 && c < 0x3400) 884 return (c / 0x80) & 0xFF; 885 else if(c >= 0xE000 && c <= 0xFFFF) 886 return ((c - 0xAC00) / 0x80) & 0xFF; 887 888 // should never happen 889 else { 890 return RESERVEDINDEX; 891 } 892 } 893 894 //========================== 895 // Check if a given character fits in a window 896 //========================== 897 898 /** 899 * Determine if a character is in a dynamic window. 900 * @param c The character to test 901 * @param whichWindow The dynamic window the test 902 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, 903 * false otherwise. 904 */ inDynamicWindow(int c, int whichWindow)905 private boolean inDynamicWindow(int c, 906 int whichWindow) 907 { 908 return (c >= fOffsets[whichWindow] 909 && c < (fOffsets[whichWindow] + 0x80)); 910 } 911 912 /** 913 * Determine if a character is in a static window. 914 * @param c The character to test 915 * @param whichWindow The static window the test 916 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, 917 * false otherwise. 918 */ inStaticWindow(int c, int whichWindow)919 private static boolean inStaticWindow(int c, 920 int whichWindow) 921 { 922 return (c >= sOffsets[whichWindow] 923 && c < (sOffsets[whichWindow] + 0x80)); 924 } 925 926 //========================== 927 // Check if a given character is compressible 928 //========================== 929 930 /** 931 * Determine if a character is compressible. 932 * @param c The character to test. 933 * @return true if the <TT>c</TT> is compressible, false otherwise. 934 */ isCompressible(int c)935 private static boolean isCompressible(int c) 936 { 937 return (c < 0x3400 || c >= 0xE000); 938 } 939 940 //========================== 941 // Check if a window is defined for a given character 942 //========================== 943 944 /** 945 * Determine if a dynamic window for a certain character is defined 946 * @param c The character in question 947 * @return The dynamic window containing <TT>c</TT>, or 948 * INVALIDWINDOW if not defined. 949 */ findDynamicWindow(int c)950 private int findDynamicWindow(int c) 951 { 952 // supposedly faster to count down 953 //for(int i = 0; i < NUMWINDOWS; i++) { 954 for(int i = NUMWINDOWS - 1; i >= 0; --i) { 955 if(inDynamicWindow(c, i)) { 956 ++fTimeStamps[i]; 957 return i; 958 } 959 } 960 961 return INVALIDWINDOW; 962 } 963 964 /** 965 * Determine if a static window for a certain character is defined 966 * @param c The character in question 967 * @return The static window containing <TT>c</TT>, or 968 * INVALIDWINDOW if not defined. 969 */ findStaticWindow(int c)970 private static int findStaticWindow(int c) 971 { 972 // supposedly faster to count down 973 //for(int i = 0; i < NUMSTATICWINDOWS; i++) { 974 for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) { 975 if(inStaticWindow(c, i)) { 976 return i; 977 } 978 } 979 980 return INVALIDWINDOW; 981 } 982 983 //========================== 984 // Find the least-recently used window 985 //========================== 986 987 /** Find the least-recently defined window */ getLRDefinedWindow()988 private int getLRDefinedWindow() 989 { 990 int leastRU = Integer.MAX_VALUE; 991 int whichWindow = INVALIDWINDOW; 992 993 // find least recently used window 994 // supposedly faster to count down 995 //for( int i = 0; i < NUMWINDOWS; i++ ) { 996 for(int i = NUMWINDOWS - 1; i >= 0; --i ) { 997 if( fTimeStamps[i] < leastRU ) { 998 leastRU = fTimeStamps[i]; 999 whichWindow = i; 1000 } 1001 } 1002 1003 return whichWindow; 1004 } 1005 1006 } 1007