1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2016, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package ohos.global.icu.text; 11 12 /** 13 * A compression engine implementing the Standard Compression Scheme 14 * for Unicode (SCSU) as outlined in <A 15 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical 16 * Report #6</A>. 17 * 18 * <P>The SCSU works by using dynamically positioned <EM>windows</EM> 19 * consisting of 128 consecutive characters in Unicode. During compression, 20 * characters within a window are encoded in the compressed stream as the bytes 21 * <TT>0x7F - 0xFF</TT>. The SCSU provides transparency for the characters 22 * (bytes) between <TT>U+0000 - U+00FF</TT>. The SCSU approximates the 23 * storage size of traditional character sets, for example 1 byte per 24 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK 25 * ideographs.</P> 26 * 27 * <P><STRONG>USAGE</STRONG></P> 28 * 29 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a 30 * straightforward manner to compress simple strings:</P> 31 * 32 * <PRE> 33 * String s = ... ; // get string from somewhere 34 * byte [] compressed = UnicodeCompressor.compress(s); 35 * </PRE> 36 * 37 * <P>The static methods have a fairly large memory footprint. 38 * For finer-grained control over memory usage, 39 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing 40 * iterative compression:</P> 41 * 42 * <PRE> 43 * // Compress an array "chars" of length "len" using a buffer of 512 bytes 44 * // to the OutputStream "out" 45 * 46 * UnicodeCompressor myCompressor = new UnicodeCompressor(); 47 * final static int BUFSIZE = 512; 48 * byte [] byteBuffer = new byte [ BUFSIZE ]; 49 * int bytesWritten = 0; 50 * int [] unicharsRead = new int [1]; 51 * int totalCharsCompressed = 0; 52 * int totalBytesWritten = 0; 53 * 54 * do { 55 * // do the compression 56 * bytesWritten = myCompressor.compress(chars, totalCharsCompressed, 57 * len, unicharsRead, 58 * byteBuffer, 0, BUFSIZE); 59 * 60 * // do something with the current set of bytes 61 * out.write(byteBuffer, 0, bytesWritten); 62 * 63 * // update the no. of characters compressed 64 * totalCharsCompressed += unicharsRead[0]; 65 * 66 * // update the no. of bytes written 67 * totalBytesWritten += bytesWritten; 68 * 69 * } while(totalCharsCompressed < len); 70 * 71 * myCompressor.reset(); // reuse compressor 72 * </PRE> 73 * 74 * @see UnicodeDecompressor 75 * 76 * @author Stephen F. Booth 77 * @hide exposed on OHOS 78 */ 79 80 /* 81 * 82 * COMPRESSION STRATEGY 83 * 84 * Single Byte Mode 85 * 86 * There are three relevant cases. 87 * If the character is in the current window or is Latin-1 (U+0000, 88 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed 89 * directly in the stream as a single byte. 90 * 91 * 1. Current character is in defined, inactive window. 92 * 2. Current character is in undefined window. 93 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF). 94 * 95 * 1. Current character is in defined, inactive window 96 * A. Look ahead two characters 97 * B. If both following characters in same window as current character, 98 * switch to defined window 99 * C. If only next character is in same window as current character, 100 * quote defined window 101 * D. If neither of following characters is in same window as current, 102 * quote defined window 103 * 104 * 2. Current character is in undefined window 105 * A. Look ahead two characters 106 * B. If both following characters in same window as current character, 107 * define new window 108 * C. If only next character in same window as current character, 109 * switch to Unicode mode 110 * NOTE: This costs us one extra byte. However, 111 * since we have a limited number of windows to work with, it is 112 * assumed the cost will pay off later in savings from a window with 113 * more characters in it. 114 * D. If neither of following characters in same window as current, 115 * switch to Unicode mode. Alternative to above: just quote 116 * Unicode (same byte cost) 117 * 118 * 3. Current character is uncompressible Unicode (U+3400 - U+DFFF) 119 * A. Look ahead one character 120 * B. If next character in non-compressible region, switch to 121 * Unicode mode 122 * C. If next character not in non-compressible region, quote Unicode 123 * 124 * 125 * The following chart illustrates the bytes required for encoding characters 126 * in each possible way 127 * 128 * 129 * SINGLE BYTE MODE 130 * Characters in a row with same index 131 * tag encountered 1 2 3 4 132 * --------------------------------------------------------------- 133 * none (in current window) 1 2 3 4 134 * 135 * quote Unicode 3 6 9 12 136 * 137 * window not switch to Unicode 3 5 7 9 byte 138 * defined define window 3 4 5 6 cost 139 * 140 * window switch to window 2 3 4 5 141 * defined quote window 2 4 6 8 142 * 143 * Unicode Mode 144 * 145 * There are two relevant cases. 146 * If the character is in the non-compressible region 147 * (U+3400 - U+DFFF), the character is simply written to the 148 * stream as a pair of bytes. 149 * 150 * 1. Current character is in defined, inactive window. 151 * 2. Current character is in undefined window. 152 * 153 * 1.Current character is in defined, inactive window 154 * A. Look ahead one character 155 * B. If next character has same index as current character, 156 * switch to defined window (and switch to single-byte mode) 157 * C. If not, just put bytes in stream 158 * 159 * 160 * 2. Current character is in undefined window 161 * A. Look ahead two characters 162 * B. If both in same window as current character, define window 163 * (and switch to single-byte mode) 164 * C. If only next character in same window, just put bytes in stream 165 * NOTE: This costs us one extra byte. However, 166 * since we have a limited number of windows to work with, it is 167 * assumed the cost will pay off later in savings from a window with 168 * more characters in it. 169 * D. If neither in same window, put bytes in stream 170 * 171 * 172 * The following chart illustrates the bytes required for encoding characters 173 * in each possible way 174 * 175 * 176 * UNICODE MODE 177 * Characters in a row with same index 178 * tag encountered 1 2 3 4 179 * --------------------------------------------------------------- 180 * none 2 4 6 8 181 * 182 * quote Unicode 3 6 9 12 183 * 184 * window not define window 3 4 5 6 byte 185 * defined cost 186 * window switch to window 2 3 4 5 187 * defined 188 */ 189 public final class UnicodeCompressor implements SCSU 190 { 191 //========================== 192 // Class variables 193 //========================== 194 195 /** For quick identification of a byte as a single-byte mode tag */ 196 private static boolean [] sSingleTagTable = { 197 // table generated by CompressionTableGenerator 198 false, true, true, true, true, true, true, true, true, false, 199 false, true, true, false, true, true, true, true, true, true, 200 true, true, true, true, true, true, true, true, true, true, 201 true, true, false, false, false, false, false, false,false, 202 false, false, false, false, false, false, false, false, false, 203 false, false, false, false, false, false, false, false, false, 204 false, false, false, false, false, false, false, false, false, 205 false, false, false, false, false, false, false, false, false, 206 false, false, false, false, false, false, false, false, false, 207 false, false, false, false, false, false, false, false, false, 208 false, false, false, false, false, false, false, false, false, 209 false, false, false, false, false, false, false, false, false, 210 false, false, false, false, false, false, false, false, false, 211 false, false, false, false, false, false, false, false, false, 212 false, false, false, false, false, false, false, false, false, 213 false, false, false, false, false, false, false, false, false, 214 false, false, false, false, false, false, false, false, false, 215 false, false, false, false, false, false, false, false, false, 216 false, false, false, false, false, false, false, false, false, 217 false, false, false, false, false, false, false, false, false, 218 false, false, false, false, false, false, false, false, false, 219 false, false, false, false, false, false, false, false, false, 220 false, false, false, false, false, false, false, false, false, 221 false, false, false, false, false, false, false, false, false, 222 false, false, false, false, false, false, false, false, false, 223 false, false, false, false, false, false, false, false, false, 224 false, false, false, false, false, false, false, false, false, 225 false, false, false, false, false, false, false, false, false, 226 false 227 }; 228 229 /** For quick identification of a byte as a unicode mode tag */ 230 private static boolean [] sUnicodeTagTable = { 231 // table generated by CompressionTableGenerator 232 false, false, false, false, false, false, false, false, false, 233 false, false, false, false, false, false, false, false, false, 234 false, false, false, false, false, false, false, false, false, 235 false, false, false, false, false, false, false, false, false, 236 false, false, false, false, false, false, false, false, false, 237 false, false, false, false, false, false, false, false, false, 238 false, false, false, false, false, false, false, false, false, 239 false, false, false, false, false, false, false, false, false, 240 false, false, false, false, false, false, false, false, false, 241 false, false, false, false, false, false, false, false, false, 242 false, false, false, false, false, false, false, false, false, 243 false, false, false, false, false, false, false, false, false, 244 false, false, false, false, false, false, false, false, false, 245 false, false, false, false, false, false, false, false, false, 246 false, false, false, false, false, false, false, false, false, 247 false, false, false, false, false, false, false, false, false, 248 false, false, false, false, false, false, false, false, false, 249 false, false, false, false, false, false, false, false, false, 250 false, false, false, false, false, false, false, false, false, 251 false, false, false, false, false, false, false, false, false, 252 false, false, false, false, false, false, false, false, false, 253 false, false, false, false, false, false, false, false, false, 254 false, false, false, false, false, false, false, false, false, 255 false, false, false, false, false, false, false, false, false, 256 false, false, false, false, false, false, false, false, true, 257 true, true, true, true, true, true, true, true, true, true, 258 true, true, true, true, true, true, true, true, false, false, 259 false, false, false, false, false, false, false, false, false, 260 false, false 261 }; 262 263 //========================== 264 // Instance variables 265 //========================== 266 267 /** Alias to current dynamic window */ 268 private int fCurrentWindow = 0; 269 270 /** Dynamic compression window offsets */ 271 private int [] fOffsets = new int [ NUMWINDOWS ]; 272 273 /** Current compression mode */ 274 private int fMode = SINGLEBYTEMODE; 275 276 /** Keeps count of times character indices are encountered */ 277 private int [] fIndexCount = new int [ MAXINDEX + 1 ]; 278 279 /** The time stamps indicate when a window was last defined */ 280 private int [] fTimeStamps = new int [ NUMWINDOWS ]; 281 282 /** The current time stamp */ 283 private int fTimeStamp = 0; 284 285 286 /** 287 * Create a UnicodeCompressor. 288 * Sets all windows to their default values. 289 * @see #reset 290 */ UnicodeCompressor()291 public UnicodeCompressor() 292 { 293 reset(); // initialize to defaults 294 } 295 296 /** 297 * Compress a string into a byte array. 298 * @param buffer The string to compress. 299 * @return A byte array containing the compressed characters. 300 * @see #compress(char [], int, int) 301 */ compress(String buffer)302 public static byte [] compress(String buffer) 303 { 304 return compress(buffer.toCharArray(), 0, buffer.length()); 305 } 306 307 /** 308 * Compress a Unicode character array into a byte array. 309 * @param buffer The character buffer to compress. 310 * @param start The start of the character run to compress. 311 * @param limit The limit of the character run to compress. 312 * @return A byte array containing the compressed characters. 313 * @see #compress(String) 314 */ compress(char [] buffer, int start, int limit)315 public static byte [] compress(char [] buffer, 316 int start, 317 int limit) 318 { 319 UnicodeCompressor comp = new UnicodeCompressor(); 320 321 // use a buffer that we know will never overflow 322 // in the worst case, each character will take 3 bytes 323 // to encode: UQU, hibyte, lobyte. In this case, the 324 // compressed data will look like: SCU, UQU, hibyte, lobyte, ... 325 // buffer must be at least 4 bytes in size 326 int len = Math.max(4, 3 * (limit - start) + 1); 327 byte [] temp = new byte [len]; 328 329 int byteCount = comp.compress(buffer, start, limit, null, 330 temp, 0, len); 331 332 byte [] result = new byte [byteCount]; 333 System.arraycopy(temp, 0, result, 0, byteCount); 334 return result; 335 } 336 337 /** 338 * Compress a Unicode character array into a byte array. 339 * 340 * This function will only consume input that can be completely 341 * output. 342 * 343 * @param charBuffer The character buffer to compress. 344 * @param charBufferStart The start of the character run to compress. 345 * @param charBufferLimit The limit of the character run to compress. 346 * @param charsRead A one-element array. If not null, on return 347 * the number of characters read from charBuffer. 348 * @param byteBuffer A buffer to receive the compressed data. This 349 * buffer must be at minimum four bytes in size. 350 * @param byteBufferStart The starting offset to which to write 351 * compressed data. 352 * @param byteBufferLimit The limiting offset for writing compressed data. 353 * @return The number of bytes written to byteBuffer. 354 */ compress(char [] charBuffer, int charBufferStart, int charBufferLimit, int [] charsRead, byte [] byteBuffer, int byteBufferStart, int byteBufferLimit)355 public int compress(char [] charBuffer, 356 int charBufferStart, 357 int charBufferLimit, 358 int [] charsRead, 359 byte [] byteBuffer, 360 int byteBufferStart, 361 int byteBufferLimit) 362 { 363 // the current position in the target byte buffer 364 int bytePos = byteBufferStart; 365 366 // the current position in the source unicode character buffer 367 int ucPos = charBufferStart; 368 369 // the current unicode character from the source buffer 370 int curUC = INVALIDCHAR; 371 372 // the index for the current character 373 int curIndex = -1; 374 375 // look ahead 376 int nextUC = INVALIDCHAR; 377 int forwardUC = INVALIDCHAR; 378 379 // temporary for window searching 380 int whichWindow = 0; 381 382 // high and low bytes of the current unicode character 383 int hiByte = 0; 384 int loByte = 0; 385 386 387 // byteBuffer must be at least 4 bytes in size 388 if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4) 389 throw new IllegalArgumentException("byteBuffer.length < 4"); 390 391 mainLoop: 392 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { 393 switch(fMode) { 394 // main single byte mode compression loop 395 case SINGLEBYTEMODE: 396 singleByteModeLoop: 397 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { 398 // get current char 399 curUC = charBuffer[ucPos++]; 400 401 // get next char 402 if(ucPos < charBufferLimit) 403 nextUC = charBuffer[ucPos]; 404 else 405 nextUC = INVALIDCHAR; 406 407 // chars less than 0x0080 (excluding tags) go straight 408 // in stream 409 if(curUC < 0x0080) { 410 loByte = curUC & 0xFF; 411 412 // we need to check and make sure we don't 413 // accidentally write a single byte mode tag to 414 // the stream unless it's quoted 415 if(sSingleTagTable[loByte]) { 416 // make sure there is enough room to 417 // write both bytes if not, rewind the 418 // source stream and break out 419 if( (bytePos + 1) >= byteBufferLimit) 420 { --ucPos; break mainLoop; } 421 422 // since we know the byte is less than 0x80, SQUOTE0 423 // will use static window 0, or ASCII 424 byteBuffer[bytePos++] = (byte) SQUOTE0; 425 } 426 427 byteBuffer[bytePos++] = (byte) loByte; 428 } 429 430 // if the char belongs to current window, convert it 431 // to a byte by adding the generic compression offset 432 // and subtracting the window's offset 433 else if(inDynamicWindow(curUC, fCurrentWindow) ) { 434 byteBuffer[bytePos++] = (byte) 435 (curUC - fOffsets[ fCurrentWindow ] 436 + COMPRESSIONOFFSET); 437 } 438 439 // if char is not in compressible range, either switch to or 440 // quote from unicode 441 else if( ! isCompressible(curUC) ) { 442 // only check next character if it is valid 443 if(nextUC != INVALIDCHAR && isCompressible(nextUC)) { 444 // make sure there is enough room to 445 // write all three bytes if not, 446 // rewind the source stream and break 447 // out 448 if( (bytePos + 2) >= byteBufferLimit) 449 { --ucPos; break mainLoop; } 450 451 byteBuffer[bytePos++] = (byte) SQUOTEU; 452 byteBuffer[bytePos++] = (byte) (curUC >>> 8); 453 byteBuffer[bytePos++] = (byte) (curUC & 0xFF); 454 } 455 else { 456 // make sure there is enough room to 457 // write all four bytes if not, rewind 458 // the source stream and break out 459 if((bytePos + 3) >= byteBufferLimit) 460 { --ucPos; break mainLoop; } 461 462 byteBuffer[bytePos++] = (byte) SCHANGEU; 463 464 hiByte = curUC >>> 8; 465 loByte = curUC & 0xFF; 466 467 if(sUnicodeTagTable[hiByte]) 468 // add quote Unicode tag 469 byteBuffer[bytePos++] = (byte) UQUOTEU; 470 471 byteBuffer[bytePos++] = (byte) hiByte; 472 byteBuffer[bytePos++] = (byte) loByte; 473 474 fMode = UNICODEMODE; 475 break singleByteModeLoop; 476 } 477 } 478 479 // if the char is in a currently defined dynamic 480 // window, figure out which one, and either switch to 481 // it or quote from it 482 else if((whichWindow = findDynamicWindow(curUC)) 483 != INVALIDWINDOW ) { 484 // look ahead 485 if( (ucPos + 1) < charBufferLimit ) 486 forwardUC = charBuffer[ucPos + 1]; 487 else 488 forwardUC = INVALIDCHAR; 489 490 // all three chars in same window, switch to that 491 // window inDynamicWindow will return false for 492 // INVALIDCHAR 493 if(inDynamicWindow(nextUC, whichWindow) 494 && inDynamicWindow(forwardUC, whichWindow)) { 495 // make sure there is enough room to 496 // write both bytes if not, rewind the 497 // source stream and break out 498 if( (bytePos + 1) >= byteBufferLimit) 499 { --ucPos; break mainLoop; } 500 501 byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow); 502 byteBuffer[bytePos++] = (byte) 503 (curUC - fOffsets[whichWindow] 504 + COMPRESSIONOFFSET); 505 fTimeStamps [ whichWindow ] = ++fTimeStamp; 506 fCurrentWindow = whichWindow; 507 } 508 509 // either only next char or neither in same 510 // window, so quote 511 else { 512 // make sure there is enough room to 513 // write both bytes if not, rewind the 514 // source stream and break out 515 if((bytePos + 1) >= byteBufferLimit) 516 { --ucPos; break mainLoop; } 517 518 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow); 519 byteBuffer[bytePos++] = (byte) 520 (curUC - fOffsets[whichWindow] 521 + COMPRESSIONOFFSET); 522 } 523 } 524 525 // if a static window is defined, and the following 526 // character is not in that static window, quote from 527 // the static window Note: to quote from a static 528 // window, don't add 0x80 529 else if((whichWindow = findStaticWindow(curUC)) 530 != INVALIDWINDOW 531 && ! inStaticWindow(nextUC, whichWindow) ) { 532 // make sure there is enough room to write both 533 // bytes if not, rewind the source stream and 534 // break out 535 if((bytePos + 1) >= byteBufferLimit) 536 { --ucPos; break mainLoop; } 537 538 byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow); 539 byteBuffer[bytePos++] = (byte) 540 (curUC - sOffsets[whichWindow]); 541 } 542 543 // if a window is not defined, decide if we want to 544 // define a new one or switch to unicode mode 545 else { 546 // determine index for current char (char is compressible) 547 curIndex = makeIndex(curUC); 548 fIndexCount[curIndex]++; 549 550 // look ahead 551 if((ucPos + 1) < charBufferLimit) 552 forwardUC = charBuffer[ucPos + 1]; 553 else 554 forwardUC = INVALIDCHAR; 555 556 // if we have encountered this index at least once 557 // before, define a new window 558 // OR 559 // three chars in a row with same index, define a 560 // new window (makeIndex will return RESERVEDINDEX 561 // for INVALIDCHAR) 562 if((fIndexCount[curIndex] > 1) || 563 (curIndex == makeIndex(nextUC) 564 && curIndex == makeIndex(forwardUC))) { 565 // make sure there is enough room to write all 566 // three bytes if not, rewind the source 567 // stream and break out 568 if( (bytePos + 2) >= byteBufferLimit) 569 { --ucPos; break mainLoop; } 570 571 // get least recently defined window 572 whichWindow = getLRDefinedWindow(); 573 574 byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow); 575 byteBuffer[bytePos++] = (byte) curIndex; 576 byteBuffer[bytePos++] = (byte) 577 (curUC - sOffsetTable[curIndex] 578 + COMPRESSIONOFFSET); 579 580 fOffsets[whichWindow] = sOffsetTable[curIndex]; 581 fCurrentWindow = whichWindow; 582 fTimeStamps [whichWindow] = ++fTimeStamp; 583 } 584 585 // only two chars in a row with same index, so 586 // switch to unicode mode (makeIndex will return 587 // RESERVEDINDEX for INVALIDCHAR) 588 // OR 589 // three chars have different indices, so switch 590 // to unicode mode 591 else { 592 // make sure there is enough room to write all 593 // four bytes if not, rewind the source stream 594 // and break out 595 if((bytePos + 3) >= byteBufferLimit) 596 { --ucPos; break mainLoop; } 597 598 byteBuffer[bytePos++] = (byte) SCHANGEU; 599 600 hiByte = curUC >>> 8; 601 loByte = curUC & 0xFF; 602 603 if(sUnicodeTagTable[hiByte]) 604 // add quote Unicode tag 605 byteBuffer[bytePos++] = (byte) UQUOTEU; 606 607 byteBuffer[bytePos++] = (byte) hiByte; 608 byteBuffer[bytePos++] = (byte) loByte; 609 610 fMode = UNICODEMODE; 611 break singleByteModeLoop; 612 } 613 } 614 } 615 break; 616 617 case UNICODEMODE: 618 // main unicode mode compression loop 619 unicodeModeLoop: 620 while(ucPos < charBufferLimit && bytePos < byteBufferLimit) { 621 // get current char 622 curUC = charBuffer[ucPos++]; 623 624 // get next char 625 if( ucPos < charBufferLimit ) 626 nextUC = charBuffer[ucPos]; 627 else 628 nextUC = INVALIDCHAR; 629 630 // if we have two uncompressible chars in a row, 631 // put the current char's bytes in the stream 632 if( ! isCompressible(curUC) 633 || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) { 634 // make sure there is enough room to write all three bytes 635 // if not, rewind the source stream and break out 636 if( (bytePos + 2) >= byteBufferLimit) 637 { --ucPos; break mainLoop; } 638 639 hiByte = curUC >>> 8; 640 loByte = curUC & 0xFF; 641 642 if(sUnicodeTagTable[ hiByte ]) 643 // add quote Unicode tag 644 byteBuffer[bytePos++] = (byte) UQUOTEU; 645 646 byteBuffer[bytePos++] = (byte) hiByte; 647 byteBuffer[bytePos++] = (byte) loByte; 648 } 649 650 // bytes less than 0x80 can go straight in the stream, 651 // but in single-byte mode 652 else if(curUC < 0x0080) { 653 loByte = curUC & 0xFF; 654 655 // if two chars in a row below 0x80 and the 656 // current char is not a single-byte mode tag, 657 // switch to single-byte mode 658 if(nextUC != INVALIDCHAR 659 && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) { 660 // make sure there is enough room to 661 // write both bytes if not, rewind the 662 // source stream and break out 663 if( (bytePos + 1) >= byteBufferLimit) 664 { --ucPos; break mainLoop; } 665 666 // use the last-active window 667 whichWindow = fCurrentWindow; 668 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow); 669 byteBuffer[bytePos++] = (byte) loByte; 670 671 //fCurrentWindow = 0; 672 fTimeStamps [whichWindow] = ++fTimeStamp; 673 fMode = SINGLEBYTEMODE; 674 break unicodeModeLoop; 675 } 676 677 // otherwise, just write the bytes to the stream 678 // (this will cover the case of only 1 char less than 0x80 679 // and single-byte mode tags) 680 else { 681 // make sure there is enough room to 682 // write both bytes if not, rewind the 683 // source stream and break out 684 if((bytePos + 1) >= byteBufferLimit) 685 { --ucPos; break mainLoop; } 686 687 // since the character is less than 0x80, the 688 // high byte is always 0x00 - no need for 689 // (curUC >>> 8) 690 byteBuffer[bytePos++] = (byte) 0x00; 691 byteBuffer[bytePos++] = (byte) loByte; 692 } 693 } 694 695 // figure out if the current char is in a defined window 696 else if((whichWindow = findDynamicWindow(curUC)) 697 != INVALIDWINDOW ) { 698 // if two chars in a row in the same window, 699 // switch to that window and go to single-byte mode 700 // inDynamicWindow will return false for INVALIDCHAR 701 if(inDynamicWindow(nextUC, whichWindow)) { 702 // make sure there is enough room to 703 // write both bytes if not, rewind the 704 // source stream and break out 705 if((bytePos + 1) >= byteBufferLimit) 706 { --ucPos; break mainLoop; } 707 708 byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow); 709 byteBuffer[bytePos++] = (byte) 710 (curUC - fOffsets[whichWindow] 711 + COMPRESSIONOFFSET); 712 713 fTimeStamps [ whichWindow ] = ++fTimeStamp; 714 fCurrentWindow = whichWindow; 715 fMode = SINGLEBYTEMODE; 716 break unicodeModeLoop; 717 } 718 719 // otherwise, just quote the unicode for the char 720 else { 721 // make sure there is enough room to 722 // write all three bytes if not, 723 // rewind the source stream and break 724 // out 725 if((bytePos + 2) >= byteBufferLimit) 726 { --ucPos; break mainLoop; } 727 728 hiByte = curUC >>> 8; 729 loByte = curUC & 0xFF; 730 731 if(sUnicodeTagTable[ hiByte ]) 732 // add quote Unicode tag 733 byteBuffer[bytePos++] = (byte) UQUOTEU; 734 735 byteBuffer[bytePos++] = (byte) hiByte; 736 byteBuffer[bytePos++] = (byte) loByte; 737 } 738 } 739 740 // char is not in a defined window 741 else { 742 // determine index for current char (char is compressible) 743 curIndex = makeIndex(curUC); 744 fIndexCount[curIndex]++; 745 746 // look ahead 747 if( (ucPos + 1) < charBufferLimit ) 748 forwardUC = charBuffer[ucPos + 1]; 749 else 750 forwardUC = INVALIDCHAR; 751 752 // if we have encountered this index at least once 753 // before, define a new window for it that hasn't 754 // previously been redefined 755 // OR 756 // if three chars in a row with the same index, 757 // define a new window (makeIndex will return 758 // RESERVEDINDEX for INVALIDCHAR) 759 if((fIndexCount[curIndex] > 1) || 760 (curIndex == makeIndex(nextUC) 761 && curIndex == makeIndex(forwardUC))) { 762 // make sure there is enough room to 763 // write all three bytes if not, 764 // rewind the source stream and break 765 // out 766 if((bytePos + 2) >= byteBufferLimit) 767 { --ucPos; break mainLoop; } 768 769 // get least recently defined window 770 whichWindow = getLRDefinedWindow(); 771 772 byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow); 773 byteBuffer[bytePos++] = (byte) curIndex; 774 byteBuffer[bytePos++] = (byte) 775 (curUC - sOffsetTable[curIndex] 776 + COMPRESSIONOFFSET); 777 778 fOffsets[whichWindow] = sOffsetTable[curIndex]; 779 fCurrentWindow = whichWindow; 780 fTimeStamps [whichWindow] = ++fTimeStamp; 781 fMode = SINGLEBYTEMODE; 782 break unicodeModeLoop; 783 } 784 785 // otherwise just quote the unicode, and save our 786 // windows for longer runs 787 else { 788 // make sure there is enough room to 789 // write all three bytes if not, 790 // rewind the source stream and break 791 // out 792 if((bytePos + 2) >= byteBufferLimit) 793 { --ucPos; break mainLoop; } 794 795 hiByte = curUC >>> 8; 796 loByte = curUC & 0xFF; 797 798 if(sUnicodeTagTable[ hiByte ]) 799 // add quote Unicode tag 800 byteBuffer[bytePos++] = (byte) UQUOTEU; 801 802 byteBuffer[bytePos++] = (byte) hiByte; 803 byteBuffer[bytePos++] = (byte) loByte; 804 } 805 } 806 } 807 } // end switch 808 } 809 810 // fill in output parameter 811 if(charsRead != null) 812 charsRead [0] = (ucPos - charBufferStart); 813 814 // return # of bytes written 815 return (bytePos - byteBufferStart); 816 } 817 818 /** 819 * Reset the compressor to its initial state. 820 */ reset()821 public void reset() 822 { 823 int i; 824 825 // reset dynamic windows 826 fOffsets[0] = 0x0080; // Latin-1 827 fOffsets[1] = 0x00C0; // Latin-1 Supplement + Latin Extended-A 828 fOffsets[2] = 0x0400; // Cyrillic 829 fOffsets[3] = 0x0600; // Arabic 830 fOffsets[4] = 0x0900; // Devanagari 831 fOffsets[5] = 0x3040; // Hiragana 832 fOffsets[6] = 0x30A0; // Katakana 833 fOffsets[7] = 0xFF00; // Fullwidth ASCII 834 835 836 // reset time stamps 837 for(i = 0; i < NUMWINDOWS; i++) { 838 fTimeStamps[i] = 0; 839 } 840 841 // reset count of seen indices 842 for(i = 0; i <= MAXINDEX; i++ ) { 843 fIndexCount[i] = 0; 844 } 845 846 fTimeStamp = 0; // Reset current time stamp 847 fCurrentWindow = 0; // Make current window Latin-1 848 fMode = SINGLEBYTEMODE; // Always start in single-byte mode 849 } 850 851 //========================== 852 // Determine the index for a character 853 //========================== 854 855 /** 856 * Create the index value for a character. 857 * For more information on this function, refer to table X-3 858 * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>. 859 * @param c The character in question. 860 * @return An index for c 861 */ makeIndex(int c)862 private static int makeIndex(int c) 863 { 864 // check the predefined indices 865 if(c >= 0x00C0 && c < 0x0140) 866 return LATININDEX; 867 else if(c >= 0x0250 && c < 0x02D0) 868 return IPAEXTENSIONINDEX; 869 else if(c >= 0x0370 && c < 0x03F0) 870 return GREEKINDEX; 871 else if(c >= 0x0530 && c < 0x0590) 872 return ARMENIANINDEX; 873 else if(c >= 0x3040 && c < 0x30A0) 874 return HIRAGANAINDEX; 875 else if(c >= 0x30A0 && c < 0x3120) 876 return KATAKANAINDEX; 877 else if(c >= 0xFF60 && c < 0xFF9F) 878 return HALFWIDTHKATAKANAINDEX; 879 880 // calculate index 881 else if(c >= 0x0080 && c < 0x3400) 882 return (c / 0x80) & 0xFF; 883 else if(c >= 0xE000 && c <= 0xFFFF) 884 return ((c - 0xAC00) / 0x80) & 0xFF; 885 886 // should never happen 887 else { 888 return RESERVEDINDEX; 889 } 890 } 891 892 //========================== 893 // Check if a given character fits in a window 894 //========================== 895 896 /** 897 * Determine if a character is in a dynamic window. 898 * @param c The character to test 899 * @param whichWindow The dynamic window the test 900 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, 901 * false otherwise. 902 */ inDynamicWindow(int c, int whichWindow)903 private boolean inDynamicWindow(int c, 904 int whichWindow) 905 { 906 return (c >= fOffsets[whichWindow] 907 && c < (fOffsets[whichWindow] + 0x80)); 908 } 909 910 /** 911 * Determine if a character is in a static window. 912 * @param c The character to test 913 * @param whichWindow The static window the test 914 * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>, 915 * false otherwise. 916 */ inStaticWindow(int c, int whichWindow)917 private static boolean inStaticWindow(int c, 918 int whichWindow) 919 { 920 return (c >= sOffsets[whichWindow] 921 && c < (sOffsets[whichWindow] + 0x80)); 922 } 923 924 //========================== 925 // Check if a given character is compressible 926 //========================== 927 928 /** 929 * Determine if a character is compressible. 930 * @param c The character to test. 931 * @return true if the <TT>c</TT> is compressible, false otherwise. 932 */ isCompressible(int c)933 private static boolean isCompressible(int c) 934 { 935 return (c < 0x3400 || c >= 0xE000); 936 } 937 938 //========================== 939 // Check if a window is defined for a given character 940 //========================== 941 942 /** 943 * Determine if a dynamic window for a certain character is defined 944 * @param c The character in question 945 * @return The dynamic window containing <TT>c</TT>, or 946 * INVALIDWINDOW if not defined. 947 */ findDynamicWindow(int c)948 private int findDynamicWindow(int c) 949 { 950 // supposedly faster to count down 951 //for(int i = 0; i < NUMWINDOWS; i++) { 952 for(int i = NUMWINDOWS - 1; i >= 0; --i) { 953 if(inDynamicWindow(c, i)) { 954 ++fTimeStamps[i]; 955 return i; 956 } 957 } 958 959 return INVALIDWINDOW; 960 } 961 962 /** 963 * Determine if a static window for a certain character is defined 964 * @param c The character in question 965 * @return The static window containing <TT>c</TT>, or 966 * INVALIDWINDOW if not defined. 967 */ findStaticWindow(int c)968 private static int findStaticWindow(int c) 969 { 970 // supposedly faster to count down 971 //for(int i = 0; i < NUMSTATICWINDOWS; i++) { 972 for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) { 973 if(inStaticWindow(c, i)) { 974 return i; 975 } 976 } 977 978 return INVALIDWINDOW; 979 } 980 981 //========================== 982 // Find the least-recently used window 983 //========================== 984 985 /** Find the least-recently defined window */ getLRDefinedWindow()986 private int getLRDefinedWindow() 987 { 988 int leastRU = Integer.MAX_VALUE; 989 int whichWindow = INVALIDWINDOW; 990 991 // find least recently used window 992 // supposedly faster to count down 993 //for( int i = 0; i < NUMWINDOWS; i++ ) { 994 for(int i = NUMWINDOWS - 1; i >= 0; --i ) { 995 if( fTimeStamps[i] < leastRU ) { 996 leastRU = fTimeStamps[i]; 997 whichWindow = i; 998 } 999 } 1000 1001 return whichWindow; 1002 } 1003 1004 } 1005