• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2016, International Business Machines Corporation and    *
7  * others. All Rights Reserved.                                                *
8  *******************************************************************************
9  */
10 package ohos.global.icu.text;
11 
12 /**
13 * A compression engine implementing the Standard Compression Scheme
14 * for Unicode (SCSU) as outlined in <A
15 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
16 * Report #6</A>.
17 *
18 * <P>The SCSU works by using dynamically positioned <EM>windows</EM>
19 * consisting of 128 consecutive characters in Unicode.  During compression,
20 * characters within a window are encoded in the compressed stream as the bytes
21 * <TT>0x80 - 0xFF</TT>. The SCSU provides transparency for the characters
22 * (bytes) between <TT>U+0000 - U+00FF</TT>.  The SCSU approximates the
23 * storage size of traditional character sets, for example 1 byte per
24 * character for ASCII or Latin-1 text, and 2 bytes per character for CJK
25 * ideographs.</P>
26 *
27 * <P><STRONG>USAGE</STRONG></P>
28 *
29 * <P>The static methods on <TT>UnicodeCompressor</TT> may be used in a
30 * straightforward manner to compress simple strings:</P>
31 *
32 * <PRE>
33 *  String s = ... ; // get string from somewhere
34 *  byte [] compressed = UnicodeCompressor.compress(s);
35 * </PRE>
36 *
37 * <P>The static methods have a fairly large memory footprint.
38 * For finer-grained control over memory usage,
39 * <TT>UnicodeCompressor</TT> offers more powerful APIs allowing
40 * iterative compression:</P>
41 *
42 * <PRE>
43 *  // Compress an array "chars" of length "len" using a buffer of 512 bytes
44 *  // to the OutputStream "out"
45 *
46 *  UnicodeCompressor myCompressor         = new UnicodeCompressor();
47 *  final static int  BUFSIZE              = 512;
48 *  byte []           byteBuffer           = new byte [ BUFSIZE ];
49 *  int               bytesWritten         = 0;
50 *  int []            unicharsRead         = new int [1];
51 *  int               totalCharsCompressed = 0;
52 *  int               totalBytesWritten    = 0;
53 *
54 *  do {
55 *    // do the compression
56 *    bytesWritten = myCompressor.compress(chars, totalCharsCompressed,
57 *                                         len, unicharsRead,
58 *                                         byteBuffer, 0, BUFSIZE);
59 *
60 *    // do something with the current set of bytes
61 *    out.write(byteBuffer, 0, bytesWritten);
62 *
63 *    // update the no. of characters compressed
64 *    totalCharsCompressed += unicharsRead[0];
65 *
66 *    // update the no. of bytes written
67 *    totalBytesWritten += bytesWritten;
68 *
69 *  } while(totalCharsCompressed &lt; len);
70 *
71 *  myCompressor.reset(); // reuse compressor
72 * </PRE>
73 *
74 * @see UnicodeDecompressor
75 *
76 * @author Stephen F. Booth
77 * @hide exposed on OHOS
78 */
79 
80 /*
81 *
82 * COMPRESSION STRATEGY
83 *
84 * Single Byte Mode
85 *
86 * There are three relevant cases.
87 * If the character is in the current window or is ASCII (U+0000,
88 * U+0009, U+000A, U+000D, U+0020 - U+007F), the character is placed
89 * directly in the stream as a single byte.
90 *
91 *  1. Current character is in defined, inactive window.
92 *  2. Current character is in undefined window.
93 *  3. Current character is uncompressible Unicode (U+3400 - U+DFFF).
94 *
95 *  1. Current character is in defined, inactive window
96 *    A. Look ahead two characters
97 *    B. If both following characters in same window as current character,
98 *       switch to defined window
99 *    C. If only next character is in same window as current character,
100 *       quote defined window
101 *    D. If neither of following characters is in same window as current,
102 *       quote defined window
103 *
104 *  2. Current character is in undefined window
105 *    A. Look ahead two characters
106 *    B. If both following characters in same window as current character,
107 *       define new window
108 *    C. If only next character in same window as current character,
109 *       switch to Unicode mode
110 *       NOTE: This costs us one extra byte.  However,
111 *        since we have a limited number of windows to work with, it is
112 *        assumed the cost will pay off later in savings from a window with
113 *        more characters in it.
114 *    D. If neither of following characters in same window as current,
115 *       switch to Unicode mode.  Alternative to above: just quote
116 *       Unicode (same byte cost)
117 *
118 *  3. Current character is uncompressible Unicode (U+3400 - U+DFFF)
119 *    A. Look ahead one character
120 *    B. If next character in non-compressible region, switch to
121 *       Unicode mode
122 *    C. If next character not in non-compressible region, quote Unicode
123 *
124 *
125 * The following chart illustrates the bytes required for encoding characters
126 * in each possible way
127 *
128 *
129 *                                   SINGLE BYTE MODE
130 *                                       Characters in a row with same index
131 *               tag encountered             1       2       3       4
132 *               ---------------------------------------------------------------
133 *               none (in current window)    1       2       3       4
134 *
135 *               quote Unicode               3       6       9       12
136 *
137 *   window not  switch to Unicode           3       5       7       9     byte
138 *   defined     define window               3       4       5       6     cost
139 *
140 *   window      switch to window            2       3       4       5
141 *   defined     quote window                2       4       6       8
142 *
143 *  Unicode Mode
144 *
145 * There are two relevant cases.
146 * If the character is in the non-compressible region
147 * (U+3400 - U+DFFF), the character is simply written to the
148 * stream as a pair of bytes.
149 *
150 * 1. Current character is in defined, inactive window.
151 * 2. Current character is in undefined window.
152 *
153 *  1.Current character is in defined, inactive window
154 *    A. Look ahead one character
155 *    B. If next character has same index as current character,
156 *       switch to defined window (and switch to single-byte mode)
157 *    C. If not, just put bytes in stream
158 *
159 *
160 *  2. Current character is in undefined window
161 *    A. Look ahead two characters
162 *    B. If both in same window as current character, define window
163 *       (and switch to single-byte mode)
164 *    C. If only next character in same window, just put bytes in stream
165 *        NOTE: This costs us one extra byte.  However,
166 *        since we have a limited number of windows to work with, it is
167 *        assumed the cost will pay off later in savings from a window with
168 *        more characters in it.
169 *    D. If neither in same window, put bytes in stream
170 *
171 *
172 * The following chart illustrates the bytes required for encoding characters
173 * in each possible way
174 *
175 *
176 *                                   UNICODE MODE
177 *                                       Characters in a row with same index
178 *               tag encountered             1       2       3       4
179 *               ---------------------------------------------------------------
180 *               none                        2       4       6       8
181 *
182 *               quote Unicode               3       6       9       12
183 *
184 *   window not  define window               3       4       5       6     byte
185 *   defined                                                               cost
186 *   window      switch to window            2       3       4       5
187 *   defined
188 */
189 public final class UnicodeCompressor implements SCSU
190 {
191     //==========================
192     // Class variables
193     //==========================
194 
195     /** Lookup table indexed by a byte value (0-255); true marks a byte that is a single-byte mode tag */
196     private static boolean [] sSingleTagTable = {
197         // table generated by CompressionTableGenerator; the true entries are indices 0x01-0x08, 0x0B, 0x0C and 0x0E-0x1F
198         false, true, true, true, true, true, true, true, true, false,
199     false, true, true, false, true, true, true, true, true, true,
200     true, true, true, true, true, true, true, true, true, true,
201     true, true, false, false, false, false, false, false,false,
202     false, false, false, false, false, false, false, false, false,
203     false, false, false, false, false, false, false, false, false,
204     false, false, false, false, false, false, false, false, false,
205     false, false, false, false, false, false, false, false, false,
206     false, false, false, false, false, false, false, false, false,
207     false, false, false, false, false, false, false, false, false,
208     false, false, false, false, false, false, false, false, false,
209     false, false, false, false, false, false, false, false, false,
210     false, false, false, false, false, false, false, false, false,
211     false, false, false, false, false, false, false, false, false,
212     false, false, false, false, false, false, false, false, false,
213     false, false, false, false, false, false, false, false, false,
214     false, false, false, false, false, false, false, false, false,
215     false, false, false, false, false, false, false, false, false,
216     false, false, false, false, false, false, false, false, false,
217     false, false, false, false, false, false, false, false, false,
218     false, false, false, false, false, false, false, false, false,
219     false, false, false, false, false, false, false, false, false,
220     false, false, false, false, false, false, false, false, false,
221     false, false, false, false, false, false, false, false, false,
222     false, false, false, false, false, false, false, false, false,
223     false, false, false, false, false, false, false, false, false,
224     false, false, false, false, false, false, false, false, false,
225     false, false, false, false, false, false, false, false, false,
226     false
227     };
228 
229     /** Lookup table indexed by a byte value (0-255); true marks a byte that is a unicode mode tag */
230     private static boolean [] sUnicodeTagTable = {
231         // table generated by CompressionTableGenerator; the true entries are indices 0xE0-0xF2
232         false, false, false, false, false, false, false, false, false,
233     false, false, false, false, false, false, false, false, false,
234     false, false, false, false, false, false, false, false, false,
235     false, false, false, false, false, false, false, false, false,
236     false, false, false, false, false, false, false, false, false,
237     false, false, false, false, false, false, false, false, false,
238     false, false, false, false, false, false, false, false, false,
239     false, false, false, false, false, false, false, false, false,
240     false, false, false, false, false, false, false, false, false,
241     false, false, false, false, false, false, false, false, false,
242     false, false, false, false, false, false, false, false, false,
243     false, false, false, false, false, false, false, false, false,
244     false, false, false, false, false, false, false, false, false,
245     false, false, false, false, false, false, false, false, false,
246     false, false, false, false, false, false, false, false, false,
247     false, false, false, false, false, false, false, false, false,
248     false, false, false, false, false, false, false, false, false,
249     false, false, false, false, false, false, false, false, false,
250     false, false, false, false, false, false, false, false, false,
251     false, false, false, false, false, false, false, false, false,
252     false, false, false, false, false, false, false, false, false,
253     false, false, false, false, false, false, false, false, false,
254     false, false, false, false, false, false, false, false, false,
255     false, false, false, false, false, false, false, false, false,
256     false, false, false, false, false, false, false, false, true,
257     true, true, true, true, true, true, true, true, true, true,
258     true, true, true, true, true, true, true, true, false, false,
259     false, false, false, false, false, false, false, false, false,
260     false, false
261     };
262 
263     //==========================
264     // Instance variables
265     //==========================
266 
267     /** Number of the currently active dynamic window (used as an index into fOffsets) */
268     private int       fCurrentWindow   = 0;
269 
270     /** Start offset of each dynamic compression window, indexed by window number */
271     private int []    fOffsets         = new int [ NUMWINDOWS ];
272 
273     /** Current compression mode (SINGLEBYTEMODE or UNICODEMODE) */
274     private int       fMode            = SINGLEBYTEMODE;
275 
276     /** Per-index counters, bumped when a character with that makeIndex() value is met and no window is defined for it */
277     private int []    fIndexCount      = new int [ MAXINDEX + 1 ];
278 
279     /** Per-window time stamps, refreshed whenever a window is defined or switched to */
280     private int []    fTimeStamps      = new int [ NUMWINDOWS ];
281 
282     /** Counter that is pre-incremented to produce the next time stamp */
283     private int       fTimeStamp       = 0;
284 
285 
286     /**
287      * Create a UnicodeCompressor.
288      * Sets all windows to their default values.
289      * @see #reset
290      */
UnicodeCompressor()291     public UnicodeCompressor()
292     {
293     reset();              // initialize to defaults
294     }
295 
296     /**
297      * Compress a string into a byte array.
298      * @param buffer The string to compress.
299      * @return A byte array containing the compressed characters.
300      * @see #compress(char [], int, int)
301      */
compress(String buffer)302     public static byte [] compress(String buffer)
303     {
304     return compress(buffer.toCharArray(), 0, buffer.length());
305     }
306 
307     /**
308      * Compress a Unicode character array into a byte array.
309      * @param buffer The character buffer to compress.
310      * @param start The start of the character run to compress.
311      * @param limit The limit of the character run to compress.
312      * @return A byte array containing the compressed characters.
313      * @see #compress(String)
314      */
compress(char [] buffer, int start, int limit)315     public static byte [] compress(char [] buffer,
316                    int start,
317                    int limit)
318     {
319     UnicodeCompressor comp = new UnicodeCompressor();
320 
321     // use a buffer that we know will never overflow
322     // in the worst case, each character will take 3 bytes
323     // to encode: UQU, hibyte, lobyte.  In this case, the
324     // compressed data will look like: SCU, UQU, hibyte, lobyte, ...
325     // buffer must be at least 4 bytes in size
326     int len = Math.max(4, 3 * (limit - start) + 1);
327     byte [] temp = new byte [len];
328 
329     int byteCount = comp.compress(buffer, start, limit, null,
330                       temp, 0, len);
331 
332     byte [] result = new byte [byteCount];
333     System.arraycopy(temp, 0, result, 0, byteCount);
334     return result;
335     }
336 
337     /**
338      * Compress a Unicode character array into a byte array.
339      *
340      * This function will only consume input that can be completely
341      * output.
342      *
343      * @param charBuffer The character buffer to compress.
344      * @param charBufferStart The start of the character run to compress.
345      * @param charBufferLimit The limit of the character run to compress.
346      * @param charsRead A one-element array.  If not null, on return
347      * the number of characters read from charBuffer.
348      * @param byteBuffer A buffer to receive the compressed data.  This
349      * buffer must be at minimum four bytes in size.
350      * @param byteBufferStart The starting offset to which to write
351      * compressed data.
352      * @param byteBufferLimit The limiting offset for writing compressed data.
353      * @return The number of bytes written to byteBuffer.
354      */
compress(char [] charBuffer, int charBufferStart, int charBufferLimit, int [] charsRead, byte [] byteBuffer, int byteBufferStart, int byteBufferLimit)355     public int compress(char []     charBuffer,
356             int         charBufferStart,
357             int         charBufferLimit,
358             int []      charsRead,
359             byte []     byteBuffer,
360             int         byteBufferStart,
361             int         byteBufferLimit)
362     {
363         // the current position in the target byte buffer
364     int     bytePos       = byteBufferStart;
365 
366     // the current position in the source unicode character buffer
367     int     ucPos         = charBufferStart;
368 
369     // the current unicode character from the source buffer
370     int     curUC         = INVALIDCHAR;
371 
372     // the index for the current character
373         int     curIndex      = -1;
374 
375     // look ahead
376     int     nextUC        = INVALIDCHAR;
377     int     forwardUC     = INVALIDCHAR;
378 
379         // temporary for window searching
380     int     whichWindow   = 0;
381 
382     // high and low bytes of the current unicode character
383     int     hiByte        = 0;
384     int     loByte        = 0;
385 
386 
387     // byteBuffer must be at least 4 bytes in size
388     if(byteBuffer.length < 4 || (byteBufferLimit - byteBufferStart) < 4)
389         throw new IllegalArgumentException("byteBuffer.length < 4");
390 
391     mainLoop:
392     while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
393         switch(fMode) {
394         // main single byte mode compression loop
395         case SINGLEBYTEMODE:
396         singleByteModeLoop:
397         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
398         // get current char
399         curUC = charBuffer[ucPos++];
400 
401         // get next char
402         if(ucPos < charBufferLimit)
403             nextUC = charBuffer[ucPos];
404         else
405             nextUC = INVALIDCHAR;
406 
407         // chars less than 0x0080 (excluding tags) go straight
408         // in stream
409         if(curUC < 0x0080) {
410             loByte = curUC & 0xFF;
411 
412             // we need to check and make sure we don't
413             // accidentally write a single byte mode tag to
414             // the stream unless it's quoted
415             if(sSingleTagTable[loByte]) {
416                                 // make sure there is enough room to
417                                 // write both bytes if not, rewind the
418                                 // source stream and break out
419             if( (bytePos + 1) >= byteBufferLimit)
420                 { --ucPos; break mainLoop; }
421 
422             // since we know the byte is less than 0x80, SQUOTE0
423             // will use static window 0, or ASCII
424             byteBuffer[bytePos++] = (byte) SQUOTE0;
425             }
426 
427             byteBuffer[bytePos++] = (byte) loByte;
428         }
429 
430         // if the char belongs to current window, convert it
431         // to a byte by adding the generic compression offset
432         // and subtracting the window's offset
433         else if(inDynamicWindow(curUC, fCurrentWindow) ) {
434             byteBuffer[bytePos++] = (byte)
435             (curUC - fOffsets[ fCurrentWindow ]
436              + COMPRESSIONOFFSET);
437         }
438 
439         // if char is not in compressible range, either switch to or
440         // quote from unicode
441         else if( ! isCompressible(curUC) ) {
442             // only check next character if it is valid
443             if(nextUC != INVALIDCHAR && isCompressible(nextUC)) {
444                                 // make sure there is enough room to
445                                 // write all three bytes if not,
446                                 // rewind the source stream and break
447                                 // out
448             if( (bytePos + 2) >= byteBufferLimit)
449                 { --ucPos; break mainLoop; }
450 
451             byteBuffer[bytePos++] = (byte) SQUOTEU;
452             byteBuffer[bytePos++] = (byte) (curUC >>> 8);
453             byteBuffer[bytePos++] = (byte) (curUC & 0xFF);
454             }
455             else {
456                                 // make sure there is enough room to
457                                 // write all four bytes if not, rewind
458                                 // the source stream and break out
459             if((bytePos + 3) >= byteBufferLimit)
460                 { --ucPos; break mainLoop; }
461 
462             byteBuffer[bytePos++] = (byte) SCHANGEU;
463 
464             hiByte = curUC >>> 8;
465             loByte = curUC & 0xFF;
466 
467             if(sUnicodeTagTable[hiByte])
468                 // add quote Unicode tag
469                 byteBuffer[bytePos++]   = (byte) UQUOTEU;
470 
471             byteBuffer[bytePos++] = (byte) hiByte;
472             byteBuffer[bytePos++] = (byte) loByte;
473 
474             fMode = UNICODEMODE;
475             break singleByteModeLoop;
476             }
477         }
478 
479         // if the char is in a currently defined dynamic
480         // window, figure out which one, and either switch to
481         // it or quote from it
482         else if((whichWindow = findDynamicWindow(curUC))
483             != INVALIDWINDOW ) {
484             // look ahead
485             if( (ucPos + 1) < charBufferLimit )
486             forwardUC = charBuffer[ucPos + 1];
487             else
488             forwardUC = INVALIDCHAR;
489 
490             // all three chars in same window, switch to that
491             // window inDynamicWindow will return false for
492             // INVALIDCHAR
493             if(inDynamicWindow(nextUC, whichWindow)
494                && inDynamicWindow(forwardUC, whichWindow)) {
495                                 // make sure there is enough room to
496                                 // write both bytes if not, rewind the
497                                 // source stream and break out
498             if( (bytePos + 1) >= byteBufferLimit)
499                 { --ucPos; break mainLoop; }
500 
501             byteBuffer[bytePos++] = (byte)(SCHANGE0 + whichWindow);
502             byteBuffer[bytePos++] = (byte)
503                 (curUC - fOffsets[whichWindow]
504                  + COMPRESSIONOFFSET);
505             fTimeStamps [ whichWindow ] = ++fTimeStamp;
506             fCurrentWindow = whichWindow;
507             }
508 
509             // either only next char or neither in same
510             // window, so quote
511             else {
512                                 // make sure there is enough room to
513                                 // write both bytes if not, rewind the
514                                 // source stream and break out
515             if((bytePos + 1) >= byteBufferLimit)
516                 { --ucPos; break mainLoop; }
517 
518             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
519             byteBuffer[bytePos++] = (byte)
520                 (curUC - fOffsets[whichWindow]
521                  + COMPRESSIONOFFSET);
522             }
523         }
524 
525         // if a static window is defined, and the following
526         // character is not in that static window, quote from
527         // the static window Note: to quote from a static
528         // window, don't add 0x80
529         else if((whichWindow = findStaticWindow(curUC))
530             != INVALIDWINDOW
531             && ! inStaticWindow(nextUC, whichWindow) ) {
532             // make sure there is enough room to write both
533             // bytes if not, rewind the source stream and
534             // break out
535             if((bytePos + 1) >= byteBufferLimit)
536             { --ucPos; break mainLoop; }
537 
538             byteBuffer[bytePos++] = (byte) (SQUOTE0 + whichWindow);
539             byteBuffer[bytePos++] = (byte)
540             (curUC - sOffsets[whichWindow]);
541         }
542 
543         // if a window is not defined, decide if we want to
544         // define a new one or switch to unicode mode
545         else {
546             // determine index for current char (char is compressible)
547             curIndex = makeIndex(curUC);
548             fIndexCount[curIndex]++;
549 
550             // look ahead
551             if((ucPos + 1) < charBufferLimit)
552             forwardUC = charBuffer[ucPos + 1];
553             else
554             forwardUC = INVALIDCHAR;
555 
556             // if we have encountered this index at least once
557             // before, define a new window
558             // OR
559             // three chars in a row with same index, define a
560             // new window (makeIndex will return RESERVEDINDEX
561             // for INVALIDCHAR)
562             if((fIndexCount[curIndex] > 1) ||
563                (curIndex == makeIndex(nextUC)
564             && curIndex == makeIndex(forwardUC))) {
565             // make sure there is enough room to write all
566             // three bytes if not, rewind the source
567             // stream and break out
568             if( (bytePos + 2) >= byteBufferLimit)
569                 { --ucPos; break mainLoop; }
570 
571             // get least recently defined window
572             whichWindow = getLRDefinedWindow();
573 
574             byteBuffer[bytePos++] = (byte)(SDEFINE0 + whichWindow);
575             byteBuffer[bytePos++] = (byte) curIndex;
576             byteBuffer[bytePos++] = (byte)
577                 (curUC - sOffsetTable[curIndex]
578                  + COMPRESSIONOFFSET);
579 
580             fOffsets[whichWindow] = sOffsetTable[curIndex];
581             fCurrentWindow = whichWindow;
582             fTimeStamps [whichWindow] = ++fTimeStamp;
583             }
584 
585             // only two chars in a row with same index, so
586             // switch to unicode mode (makeIndex will return
587             // RESERVEDINDEX for INVALIDCHAR)
588             // OR
589             // three chars have different indices, so switch
590             // to unicode mode
591             else {
592             // make sure there is enough room to write all
593             // four bytes if not, rewind the source stream
594             // and break out
595             if((bytePos + 3) >= byteBufferLimit)
596                 { --ucPos; break mainLoop; }
597 
598             byteBuffer[bytePos++] = (byte) SCHANGEU;
599 
600             hiByte = curUC >>> 8;
601             loByte = curUC & 0xFF;
602 
603             if(sUnicodeTagTable[hiByte])
604                 // add quote Unicode tag
605                 byteBuffer[bytePos++] = (byte) UQUOTEU;
606 
607             byteBuffer[bytePos++] = (byte) hiByte;
608             byteBuffer[bytePos++] = (byte) loByte;
609 
610             fMode = UNICODEMODE;
611             break singleByteModeLoop;
612             }
613         }
614         }
615         break;
616 
617         case UNICODEMODE:
618         // main unicode mode compression loop
619         unicodeModeLoop:
620         while(ucPos < charBufferLimit && bytePos < byteBufferLimit) {
621         // get current char
622         curUC = charBuffer[ucPos++];
623 
624         // get next char
625         if( ucPos < charBufferLimit )
626             nextUC = charBuffer[ucPos];
627         else
628             nextUC = INVALIDCHAR;
629 
630         // if we have two uncompressible chars in a row,
631         // put the current char's bytes in the stream
632         if( ! isCompressible(curUC)
633             || (nextUC != INVALIDCHAR && ! isCompressible(nextUC))) {
634             // make sure there is enough room to write all three bytes
635             // if not, rewind the source stream and break out
636             if( (bytePos + 2) >= byteBufferLimit)
637             { --ucPos; break mainLoop; }
638 
639             hiByte = curUC >>> 8;
640             loByte = curUC & 0xFF;
641 
642             if(sUnicodeTagTable[ hiByte ])
643             // add quote Unicode tag
644             byteBuffer[bytePos++] = (byte) UQUOTEU;
645 
646             byteBuffer[bytePos++] = (byte) hiByte;
647             byteBuffer[bytePos++] = (byte) loByte;
648         }
649 
650         // bytes less than 0x80 can go straight in the stream,
651         // but in single-byte mode
652         else if(curUC < 0x0080) {
653             loByte = curUC & 0xFF;
654 
655             // if two chars in a row below 0x80 and the
656             // current char is not a single-byte mode tag,
657             // switch to single-byte mode
658             if(nextUC != INVALIDCHAR
659                && nextUC < 0x0080 && ! sSingleTagTable[ loByte ] ) {
660                                 // make sure there is enough room to
661                                 // write both bytes if not, rewind the
662                                 // source stream and break out
663             if( (bytePos + 1) >= byteBufferLimit)
664                 { --ucPos; break mainLoop; }
665 
666             // use the last-active window
667             whichWindow = fCurrentWindow;
668             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
669             byteBuffer[bytePos++] = (byte) loByte;
670 
671             //fCurrentWindow = 0;
672             fTimeStamps [whichWindow] = ++fTimeStamp;
673             fMode = SINGLEBYTEMODE;
674             break unicodeModeLoop;
675             }
676 
677             // otherwise, just write the bytes to the stream
678             // (this will cover the case of only 1 char less than 0x80
679             // and single-byte mode tags)
680             else {
681                                 // make sure there is enough room to
682                                 // write both bytes if not, rewind the
683                                 // source stream and break out
684             if((bytePos + 1) >= byteBufferLimit)
685                 { --ucPos; break mainLoop; }
686 
687             // since the character is less than 0x80, the
688             // high byte is always 0x00 - no need for
689             // (curUC >>> 8)
690             byteBuffer[bytePos++] = (byte) 0x00;
691             byteBuffer[bytePos++] = (byte) loByte;
692             }
693         }
694 
695         // figure out if the current char is in a defined window
696         else if((whichWindow = findDynamicWindow(curUC))
697             != INVALIDWINDOW ) {
698             // if two chars in a row in the same window,
699             // switch to that window and go to single-byte mode
700             // inDynamicWindow will return false for INVALIDCHAR
701             if(inDynamicWindow(nextUC, whichWindow)) {
702                                 // make sure there is enough room to
703                                 // write both bytes if not, rewind the
704                                 // source stream and break out
705             if((bytePos + 1) >= byteBufferLimit)
706                 { --ucPos; break mainLoop; }
707 
708             byteBuffer[bytePos++] = (byte)(UCHANGE0 + whichWindow);
709             byteBuffer[bytePos++] = (byte)
710                 (curUC - fOffsets[whichWindow]
711                  + COMPRESSIONOFFSET);
712 
713             fTimeStamps [ whichWindow ] = ++fTimeStamp;
714             fCurrentWindow = whichWindow;
715             fMode = SINGLEBYTEMODE;
716             break unicodeModeLoop;
717             }
718 
719             // otherwise, just quote the unicode for the char
720             else {
721                                 // make sure there is enough room to
722                                 // write all three bytes if not,
723                                 // rewind the source stream and break
724                                 // out
725             if((bytePos + 2) >= byteBufferLimit)
726                 { --ucPos; break mainLoop; }
727 
728             hiByte = curUC >>> 8;
729             loByte = curUC & 0xFF;
730 
731             if(sUnicodeTagTable[ hiByte ])
732                 // add quote Unicode tag
733                 byteBuffer[bytePos++] = (byte) UQUOTEU;
734 
735             byteBuffer[bytePos++] = (byte) hiByte;
736             byteBuffer[bytePos++] = (byte) loByte;
737             }
738         }
739 
740         // char is not in a defined window
741         else {
742             // determine index for current char (char is compressible)
743             curIndex = makeIndex(curUC);
744             fIndexCount[curIndex]++;
745 
746             // look ahead
747             if( (ucPos + 1) < charBufferLimit )
748             forwardUC = charBuffer[ucPos + 1];
749             else
750             forwardUC = INVALIDCHAR;
751 
752             // if we have encountered this index at least once
753             // before, define a new window for it that hasn't
754             // previously been redefined
755             // OR
756             // if three chars in a row with the same index,
757             // define a new window (makeIndex will return
758             // RESERVEDINDEX for INVALIDCHAR)
759             if((fIndexCount[curIndex] > 1) ||
760                (curIndex == makeIndex(nextUC)
761             && curIndex == makeIndex(forwardUC))) {
762                                 // make sure there is enough room to
763                                 // write all three bytes if not,
764                                 // rewind the source stream and break
765                                 // out
766             if((bytePos + 2) >= byteBufferLimit)
767                 { --ucPos; break mainLoop; }
768 
769             // get least recently defined window
770             whichWindow = getLRDefinedWindow();
771 
772             byteBuffer[bytePos++] = (byte)(UDEFINE0 + whichWindow);
773             byteBuffer[bytePos++] = (byte) curIndex;
774             byteBuffer[bytePos++] = (byte)
775                 (curUC - sOffsetTable[curIndex]
776                  + COMPRESSIONOFFSET);
777 
778             fOffsets[whichWindow] = sOffsetTable[curIndex];
779             fCurrentWindow = whichWindow;
780             fTimeStamps [whichWindow] = ++fTimeStamp;
781             fMode = SINGLEBYTEMODE;
782             break unicodeModeLoop;
783             }
784 
785             // otherwise just quote the unicode, and save our
786             // windows for longer runs
787             else {
788                                 // make sure there is enough room to
789                                 // write all three bytes if not,
790                                 // rewind the source stream and break
791                                 // out
792             if((bytePos + 2) >= byteBufferLimit)
793                 { --ucPos; break mainLoop; }
794 
795             hiByte = curUC >>> 8;
796             loByte = curUC & 0xFF;
797 
798             if(sUnicodeTagTable[ hiByte ])
799                 // add quote Unicode tag
800                 byteBuffer[bytePos++] = (byte) UQUOTEU;
801 
802             byteBuffer[bytePos++] = (byte) hiByte;
803             byteBuffer[bytePos++] = (byte) loByte;
804             }
805         }
806         }
807         }  // end switch
808     }
809 
810         // fill in output parameter
811     if(charsRead != null)
812         charsRead [0] = (ucPos - charBufferStart);
813 
814         // return # of bytes written
815         return (bytePos - byteBufferStart);
816     }
817 
818     /**
819      * Reset the compressor to its initial state.
820      */
reset()821     public void reset()
822     {
823     int i;
824 
825         // reset dynamic windows
826         fOffsets[0] = 0x0080;    // Latin-1
827         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
828         fOffsets[2] = 0x0400;    // Cyrillic
829         fOffsets[3] = 0x0600;    // Arabic
830         fOffsets[4] = 0x0900;    // Devanagari
831         fOffsets[5] = 0x3040;    // Hiragana
832         fOffsets[6] = 0x30A0;    // Katakana
833         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
834 
835 
836         // reset time stamps
837         for(i = 0; i < NUMWINDOWS; i++) {
838             fTimeStamps[i]          = 0;
839         }
840 
841         // reset count of seen indices
842         for(i = 0; i <= MAXINDEX; i++ ) {
843             fIndexCount[i] = 0;
844         }
845 
846         fTimeStamp      = 0;                // Reset current time stamp
847         fCurrentWindow  = 0;                // Make current window Latin-1
848         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
849     }
850 
851     //==========================
852     // Determine the index for a character
853     //==========================
854 
855     /**
856      * Create the index value for a character.
857      * For more information on this function, refer to table X-3
858      * <A HREF="http://www.unicode.org/unicode/reports/tr6">UTR6</A>.
859      * @param c The character in question.
860      * @return An index for c
861      */
makeIndex(int c)862     private static int makeIndex(int c)
863     {
864         // check the predefined indices
865         if(c >= 0x00C0 && c < 0x0140)
866             return LATININDEX;
867         else if(c >= 0x0250 && c < 0x02D0)
868             return IPAEXTENSIONINDEX;
869         else if(c >= 0x0370 && c < 0x03F0)
870             return GREEKINDEX;
871         else if(c >= 0x0530 && c < 0x0590)
872             return ARMENIANINDEX;
873         else if(c >= 0x3040 && c < 0x30A0)
874             return HIRAGANAINDEX;
875         else if(c >= 0x30A0 && c < 0x3120)
876             return KATAKANAINDEX;
877         else if(c >= 0xFF60 && c < 0xFF9F)
878             return HALFWIDTHKATAKANAINDEX;
879 
880         // calculate index
881         else if(c >= 0x0080 && c < 0x3400)
882             return (c / 0x80) & 0xFF;
883         else if(c >= 0xE000 && c <= 0xFFFF)
884             return ((c - 0xAC00) / 0x80) & 0xFF;
885 
886         // should never happen
887         else {
888             return RESERVEDINDEX;
889         }
890     }
891 
892     //==========================
893     // Check if a given character fits in a window
894     //==========================
895 
896     /**
897     * Determine if a character is in a dynamic window.
898     * @param c The character to test
899     * @param whichWindow The dynamic window to test
900     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
901     * false otherwise.
902     */
inDynamicWindow(int c, int whichWindow)903     private boolean inDynamicWindow(int c,
904                     int whichWindow)
905     {
906         return (c >= fOffsets[whichWindow]
907         && c < (fOffsets[whichWindow] + 0x80));
908     }
909 
910     /**
911      * Determine if a character is in a static window.
912     * @param c The character to test
913     * @param whichWindow The static window to test
914     * @return true if <TT>c</TT> will fit in <TT>whichWindow</TT>,
915     * false otherwise.
916     */
inStaticWindow(int c, int whichWindow)917     private static boolean inStaticWindow(int c,
918                       int whichWindow)
919     {
920         return (c >= sOffsets[whichWindow]
921         && c < (sOffsets[whichWindow] + 0x80));
922     }
923 
924     //==========================
925     // Check if a given character is compressible
926     //==========================
927 
928     /**
929     * Determine if a character is compressible.
930     * @param c The character to test.
931     * @return true if the <TT>c</TT> is compressible, false otherwise.
932     */
isCompressible(int c)933     private static boolean isCompressible(int c)
934     {
935         return (c < 0x3400 || c >= 0xE000);
936     }
937 
938     //==========================
939     // Check if a window is defined for a given character
940     //==========================
941 
942     /**
943      * Determine if a dynamic window for a certain character is defined
944      * @param c The character in question
945      * @return The dynamic window containing <TT>c</TT>, or
946      * INVALIDWINDOW if not defined.
947      */
findDynamicWindow(int c)948     private int findDynamicWindow(int c)
949     {
950     // supposedly faster to count down
951         //for(int i = 0; i < NUMWINDOWS; i++) {
952     for(int i = NUMWINDOWS - 1; i >= 0; --i) {
953         if(inDynamicWindow(c, i)) {
954         ++fTimeStamps[i];
955                 return i;
956         }
957     }
958 
959         return INVALIDWINDOW;
960     }
961 
962     /**
963      * Determine if a static window for a certain character is defined
964      * @param c The character in question
965      * @return The static window containing <TT>c</TT>, or
966      * INVALIDWINDOW if not defined.
967      */
findStaticWindow(int c)968     private static int findStaticWindow(int c)
969     {
970     // supposedly faster to count down
971         //for(int i = 0; i < NUMSTATICWINDOWS; i++) {
972     for(int i = NUMSTATICWINDOWS - 1; i >= 0; --i) {
973         if(inStaticWindow(c, i)) {
974                 return i;
975         }
976     }
977 
978         return INVALIDWINDOW;
979     }
980 
981     //==========================
982     // Find the least-recently used window
983     //==========================
984 
985     /** Find the least-recently defined window */
getLRDefinedWindow()986     private int getLRDefinedWindow()
987     {
988         int leastRU         = Integer.MAX_VALUE;
989         int whichWindow     = INVALIDWINDOW;
990 
991         // find least recently used window
992         // supposedly faster to count down
993         //for( int i = 0; i < NUMWINDOWS; i++ ) {
994         for(int i = NUMWINDOWS - 1; i >= 0; --i ) {
995             if( fTimeStamps[i] < leastRU ) {
996                 leastRU   = fTimeStamps[i];
997                 whichWindow  = i;
998             }
999         }
1000 
1001         return whichWindow;
1002     }
1003 
1004 }
1005